pypdf合并PDF并添加目录、页脚和书签

Python 城市风 7/6/2024 1596 次 0 条

在处理文档时,经常会遇到需要将指定目录下多个子文件夹中的PDF文件合并,并添加目录、页脚和书签的情况。本文将介绍如何使用pypdf和reportlab库来实现这一功能。

1.准备环境

首先,确保你的环境中安装了pypdf和reportlab库。

pip install pypdf reportlab

2.功能介绍

根据待合并PDF的类型建立相应的文件夹,并将相关附件按顺序存放在各自的文件夹中,如下图所示。

文件列表.fw.png 

运行代码后,程序会将指定文件夹下的所有PDF文件进行合并,并生成目录页,同时为每页添加页码,并为文档添加书签。生成的PDF如下所示。

生成PDF效果图.png


3.整体代码

import re
import os
import io
from pypdf import PdfReader,PdfWriter,PaperSize,Transformation

from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.pdfmetrics import stringWidth

# Register every TrueType font the script refers to by alias, in the same
# order as before: (alias used in setFont/stringWidth calls, font file name).
for _alias, _font_file in (
    ('msyhbd', 'msyhbd.ttc'),
    ('msyh', 'msyh.ttc'),
    ('times', 'times.ttf'),
    ('timesbd', 'timesbd.ttf'),
):
    pdfmetrics.registerFont(TTFont(_alias, _font_file))
del _alias, _font_file  # keep the module namespace free of loop variables

class MergePDF:
    """Merge the PDFs stored in the sub-folders of a directory into one PDF.

    The merged document starts with a generated table of contents (numbered
    with Roman numerals), followed by every source page with a centred page
    number in the footer, and carries an outline (bookmarks) that mirrors the
    folder structure.

    Attributes:
        folder: Root directory that was scanned.
        pdfs: Flat, ordered list of entries (dicts with the keys ``name``,
            ``path``, ``isdir`` and ``level``; ``page_number`` is added by
            ``extract_pages``).
        writer: The ``PdfWriter`` that accumulates the output document.
    """

    # Matches each run of digits in a file name; used for natural sorting.
    _NUMBER_RE = re.compile(r'\d+')

    def __init__(self, folder):
        """Scan *folder* and prepare an empty writer for the merged output."""
        self.folder = folder
        self.pdfs = []
        self.get_pdfs(folder)
        self.writer = PdfWriter()

    def get_pdfs(self, root_dir, level=1, isall=False):
        """Recursively collect folders and PDF files below *root_dir*.

        Entries are appended to ``self.pdfs`` in depth-first order. Names are
        sorted by the tuple of integers they contain (so "2.10" follows
        "2.9"), with the name itself as a tie-breaker so the order does not
        depend on what ``os.listdir`` happens to return.

        Args:
            root_dir: Directory to scan.
            level: Nesting depth recorded for the entries of *root_dir*.
            isall: When false, files directly inside *root_dir* are skipped
                and only its sub-folders are collected. The initial call from
                ``__init__`` uses this, so PDFs lying next to the top-level
                folders are not merged.
        """
        def natural_key(filename):
            numbers = tuple(int(part) for part in self._NUMBER_RE.findall(filename))
            return numbers, filename

        for name in sorted(os.listdir(root_dir), key=natural_key):
            path = os.path.join(root_dir, name)
            if os.path.isdir(path):
                self.pdfs.append({'name': name, 'path': path, 'isdir': True, 'level': level})
                self.get_pdfs(path, level + 1, True)
                continue
            if not isall:
                continue
            stem, ext = os.path.splitext(name)
            if ext.lower() == '.pdf':
                self.pdfs.append({'name': stem, 'path': path, 'isdir': False, 'level': level})

    def create_footer_page(self, footer_text, page_size=None):
        """Build a one-page PDF that only contains a centred footer text.

        Args:
            footer_text: Text to draw (normally the page number).
            page_size: Optional ``(width, height)`` in points of the page the
                footer will be merged onto; defaults to A4. The text is
                centred horizontally within this width.

        Returns:
            A ``PdfReader`` over the in-memory footer PDF.
        """
        width, height = A4 if page_size is None else page_size
        font_name = 'timesbd'
        font_size = 12

        packet = io.BytesIO()
        c = canvas.Canvas(packet, pagesize=(width, height))
        text_width = c.stringWidth(footer_text, font_name, font_size)
        c.setFont(font_name, font_size)
        c.setFillColorRGB(0, 0, 0)  # black text
        c.drawString((width - text_width) / 2, 32, footer_text)
        c.save()

        packet.seek(0)
        return PdfReader(packet)

    @staticmethod
    def _wrap_text(text, font_name, font_size, max_width):
        """Split *text* into lines that are each at most *max_width* wide.

        Every line keeps at least one character, so the loop always makes
        progress even if *max_width* is smaller than a single glyph.
        """
        lines = []
        remaining = text
        while remaining and stringWidth(remaining, font_name, font_size) > max_width:
            cut = len(remaining)
            while cut > 1 and stringWidth(remaining[:cut], font_name, font_size) > max_width:
                cut -= 1
            lines.append(remaining[:cut])
            remaining = remaining[cut:]
        lines.append(remaining)
        return lines

    def add_catalog_page(self):
        """Render the table of contents as an in-memory A4 PDF.

        ``extract_pages`` must have run first, because every entry's
        ``page_number`` is read here. Folder entries use the bold fonts, file
        entries the regular ones; each entry is indented by its level, wrapped
        over as many lines as it needs, and connected to its page number by a
        dotted leader on its last line.

        Returns:
            A ``PdfReader`` over the generated catalog (one or more pages).
        """
        packet = io.BytesIO()
        c = canvas.Canvas(packet, pagesize=A4)
        width, height = A4

        top_margin = 60
        bottom_margin = 60
        left_margin = 72
        right_margin = width - 72
        dot_space = 5        # distance between two leader dots
        entry_spacing = 25   # vertical distance between two entries
        wrap_spacing = 15    # vertical distance between wrapped lines of one entry
        indent = 5           # extra left indent per nesting level
        split_ratio = 0.97   # fraction of the available width a title line may use
        font_size = 12

        y_position = height - top_margin
        c.setFont("msyhbd", 16)
        c.drawString(280, y_position, "目录")
        y_position -= 30

        for item in self.pdfs:
            prefix = indent * (item['level'] - 1)
            title_font = 'msyhbd' if item['isdir'] else 'msyh'
            page_number_font = 'timesbd' if item['isdir'] else 'times'
            page_number_str = str(item['page_number'] + 1)
            page_number_width = stringWidth(page_number_str, page_number_font, font_size)

            available_width = right_margin - left_margin - dot_space * 2 - prefix
            lines = self._wrap_text(
                f"{item['name']}", title_font, font_size, available_width * split_ratio
            )

            # Break the page *before* drawing an entry that would not fit, so
            # no page break is ever emitted after the final entry.
            if y_position - wrap_spacing * (len(lines) - 1) < bottom_margin:
                c.showPage()
                y_position = height - top_margin

            # showPage resets the canvas state, so the font is set per entry.
            c.setFont(title_font, font_size)
            x_start = left_margin + prefix
            for line in lines[:-1]:
                c.drawString(x_start, y_position, line)
                y_position -= wrap_spacing
            last_line = lines[-1]
            c.drawString(x_start, y_position, last_line)

            # Dotted leader between the end of the last title line and the number.
            dot_x = x_start + stringWidth(last_line, title_font, font_size) + 10
            dot_end = right_margin - page_number_width - 10
            while dot_x < dot_end:
                c.drawString(dot_x, y_position, ".")
                dot_x += dot_space

            # Draw the number in the same font its width was measured with.
            c.setFont(page_number_font, font_size)
            c.drawRightString(right_margin, y_position, page_number_str)

            y_position -= entry_spacing

        c.save()
        packet.seek(0)
        return PdfReader(packet)

    def extract_pages(self):
        """Load all source pages and record where each entry starts.

        Fills ``self.all_pages`` with the pages of every PDF entry in order
        and stores, in each entry's ``page_number``, the zero-based index of
        the first page that follows it (for a folder: the first page of its
        first file).
        """
        self.all_pages = []
        for item in self.pdfs:
            item['page_number'] = len(self.all_pages)
            if not item['isdir']:
                self.all_pages.extend(PdfReader(item['path']).pages)

    def add_bookmarks(self, offset):
        """Add one outline item per entry, nested like the folder tree.

        Args:
            offset: Number of pages placed before the merged content (the
                catalog pages); added to every entry's ``page_number``.
        """
        page_count = len(self.writer.pages)
        parents = {0: None}  # level -> outline item of the most recent folder
        for item in self.pdfs:
            page_index = item['page_number'] + offset
            if page_count:
                # An empty trailing folder would otherwise point past the end.
                page_index = min(page_index, page_count - 1)
            bookmark = self.writer.add_outline_item(
                item['name'], page_index, parent=parents.get(item['level'] - 1)
            )
            if item['isdir']:
                parents[item['level']] = bookmark

    def int_to_roman(self, num):
        """Return *num* as an upper-case Roman numeral ('' for values <= 0)."""
        if num <= 0:
            return ''
        numerals = (
            (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
            (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
            (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"),
        )
        parts = []
        for value, symbol in numerals:
            count, num = divmod(num, value)
            parts.append(symbol * count)
        return ''.join(parts)

    def _add_page_with_footer(self, page, footer_text):
        """Stamp *footer_text* onto *page* and append the page to the writer."""
        # Centre the footer for the page's own size instead of assuming A4.
        page_size = (float(page.mediabox.width), float(page.mediabox.height))
        footer_pdf = self.create_footer_page(footer_text, page_size)
        page.merge_page(footer_pdf.pages[0])
        self.writer.add_page(page)

    def save(self, output_pdf_path):
        """Assemble the merged document and write it to *output_pdf_path*.

        Catalog pages come first and are numbered I, II, ...; the merged
        content pages follow and are numbered from 1. Bookmarks are added
        last, shifted by the number of catalog pages.

        Note: the footer position does not compensate for a page's rotation
        flag, so rotated pages may still show the number in the wrong place.
        """
        self.extract_pages()

        catalog_pdf = self.add_catalog_page()
        catalog_pages_count = len(catalog_pdf.pages)
        for number, page in enumerate(catalog_pdf.pages, start=1):
            self._add_page_with_footer(page, f"{self.int_to_roman(number)}")

        # Content numbering restarts at 1 and ignores the catalog pages.
        for number, page in enumerate(self.all_pages, start=1):
            self._add_page_with_footer(page, f"{number}")

        self.add_bookmarks(catalog_pages_count)

        with open(output_pdf_path, "wb") as f_out:
            self.writer.write(f_out)


if __name__ == '__main__':
    # Script entry point: merge everything found below the current working
    # directory and write the result next to the scanned folders.
    source_root = os.getcwd()
    output_name = '佐证材料.pdf'

    merger = MergePDF(source_root)
    print('正在合并文件中……')
    merger.save(output_name)
    print('Ok')
    

4.存在问题

a) 如果页面是横向版式,或者发生旋转,底部页码的标注位置存在问题;

b) 保持原有页面大小,暂时不能统一缩小成A4大小;

c) 只能合并PDF文件,不能合并PDF以外的任何文件;

d) 按照文件名排序时,如果文件数量超过10个,排序会有问题。比如“2.10 XXXX”不是排在2.9后面,而是排在2.1后面、2.2前面。(已解决)


对于前三个问题,可以先将文件转换成PDF,并在PDF编辑软件中预先处理好(例如调整为纵向布局、统一页面大小等),然后再进行合并。