Python 批量合并 pdf

"""
合并多个 PDF 为一个文件。使用方式:把本脚本放到 PDF 所在的文件夹下运行。注意:pyPdf 库比较老,已经很久没有更新了。
"""

import os.path

from pyPdf import PdfFileReader, PdfFileWriter # pip install pyPdf


def get_pdf_files(dst_dir):
    """Return the paths of all files ending in '.pdf' under ``dst_dir``.

    The directory tree is walked recursively with ``os.walk``; each result is
    the walked directory joined with the file name. The suffix check is
    case-sensitive.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dst_dir)
        for name in names
        if name.endswith('.pdf')
    ]


##########################合并同一个文件夹下所有PDF文件########################
def merge_pdf(dst_dir, outfile, sort=True):
    """Merge every PDF found under ``dst_dir`` into one file using pyPdf.

    Args:
        dst_dir: Directory searched recursively (via ``get_pdf_files``) for
            '.pdf' inputs. The merged file is written into this directory.
        outfile: File name of the merged PDF, joined onto ``dst_dir``.
        sort: When true, inputs are merged in sorted path order; otherwise in
            the order ``get_pdf_files`` returns them.
    """
    # Join with os.path.join so a dst_dir without a trailing separator still
    # yields a path inside the directory.
    out_path = os.path.join(dst_dir, outfile)
    out_abs = os.path.abspath(out_path)
    # Leave out a previous merge result so a re-run does not merge the output
    # into itself.
    pdf_paths = [p for p in get_pdf_files(dst_dir) if os.path.abspath(p) != out_abs]
    if sort:
        pdf_paths.sort()

    output = PdfFileWriter()
    curpage = 0
    # Inputs stay open until the merged file is written, because the writer
    # may still need to read page data from them at that point.
    opened = []
    try:
        for each in pdf_paths:
            print(each)
            stream = open(each, "rb")
            opened.append(stream)
            reader = PdfFileReader(stream)
            # An encrypted PDF has to be decrypted before pyPdf can use it;
            # the hard-coded password "map" is tried.
            if reader.isEncrypted:
                reader.decrypt("map")
            # Number of pages in this source PDF.
            page_count = reader.getNumPages()
            curpage += page_count
            print(page_count)

            for page_index in range(page_count):
                output.addPage(reader.getPage(page_index))

        print("All Pages Number:" + str(curpage))
        with open(out_path, "wb") as output_stream:
            output.write(output_stream)
    finally:
        for stream in opened:
            stream.close()


def main():
    """Merge all PDFs under the current directory into "all.pdf"."""
    merge_pdf("./", "all.pdf")


# Script entry point: run the merge only when this file is executed directly,
# not when it is imported.
if __name__ == '__main__':
    main()

推荐使用下面的 pikepdf 库来批量操作 PDF,因为 pyPdf 库已经很久没有更新了。

# -*- coding: utf-8 -*-

import os.path
from pikepdf import Pdf, OutlineItem  # pip install pikepdf


def get_pdf_files(dst_dir):
    """Collect the paths of all '.pdf' files below ``dst_dir``.

    Walks the tree with ``os.walk`` and returns, in walk order, each matching
    file name joined onto the directory it was found in. The suffix match is
    case-sensitive.
    """
    found = []
    for folder, _subdirs, names in os.walk(dst_dir):
        found.extend(
            os.path.join(folder, name) for name in names if name.endswith('.pdf')
        )
    return found


def merge_pdf(dst_dir, outfile, sort=True):
    """Merge every PDF found under ``dst_dir`` into ``outfile`` using pikepdf.

    Each source file gets a top-level outline (bookmark) entry, titled with
    its base name, pointing at the first page it contributes.

    Args:
        dst_dir: Directory searched recursively (via ``get_pdf_files``) for
            '.pdf' inputs.
        outfile: Path the merged PDF is saved to, passed to ``Pdf.save``
            unchanged (it is not joined onto ``dst_dir``).
        sort: When true, inputs are merged in sorted path order; otherwise in
            the order ``get_pdf_files`` returns them.
    """
    # https://pikepdf.readthedocs.io/en/latest/topics/pages.html#merge-concatenate-pdf-from-several-pdfs
    out_abs = os.path.abspath(outfile)
    # Leave out a previous merge result: otherwise a re-run would read the
    # output file as an input while also saving over it.
    pdf_paths = [p for p in get_pdf_files(dst_dir) if os.path.abspath(p) != out_abs]
    if sort:
        pdf_paths.sort()

    pdf = Pdf.new()
    page_count = 0
    # Sources stay open until after save(), since the merged document still
    # refers to their pages; they are closed in the finally block.
    sources = []
    try:
        with pdf.open_outline() as outline:
            for path in pdf_paths:
                src = Pdf.open(path)
                sources.append(src)
                print("merging:" + src.filename)  # progress output
                # Bookmark for this file; page_count is the index of its
                # first page in the merged document.
                oi = OutlineItem(os.path.basename(src.filename), page_count)
                outline.root.append(oi)
                page_count += len(src.pages)
                pdf.pages.extend(src.pages)
        pdf.save(outfile)
    finally:
        for src in sources:
            src.close()


def main():
    """Merge all PDFs under the current directory into "all.pdf"."""
    merge_pdf("./", "all.pdf")


# Script entry point: run the merge only when this file is executed directly,
# not when it is imported.
if __name__ == '__main__':
    main()