PyMuPDF合并PDF并添加目录、页脚和书签

Python 城市风 8/1/2024 257 次 0 条

前面的文章使用 pypdf 和 reportlab 库实现了 PDF 合并、添加目录、页脚和书签等功能。下面我们改用 PyMuPDF 来完成同样的操作。

本文重点解决了前面文章 https://www.csfplus.com/post/py-merge-pdf.html 中存在的问题,并做了如下改进:

1)将所有横向页面切换成纵向页面;

2)将页面缩放成A4大小方便打印;

3)添加目录页,如果目录页数是奇数,则补一页空白页,方便双面打印;

4)文件合并根据现有的目录和文件结构进行组织;

5)文件按文件名中的数字顺序排序,请自行为文件编号命名。

PyMuPDF 文档:https://pymupdf.readthedocs.io/


import os
import re
import pymupdf

class MergePDF:
    """Merge the PDFs of a folder tree into one A4 document with an index.

    The sub-folders of ``folder`` are walked recursively.  Every folder and
    every ``.pdf`` file found inside a sub-folder becomes one entry of the
    printed index and of the PDF outline (bookmarks).  Files that sit directly
    in ``folder`` are ignored.

    Typical use::

        MergePDF(path).save("merged.pdf")

    Attributes:
        folder: Root directory that was scanned.
        pdfs: Ordered list of entry dicts with the keys ``name``, ``path``,
            ``isdir`` and ``level``; ``page_number`` is added while merging.
        doc: The PyMuPDF output document being built.
    """

    def __init__(self, folder):
        """Scan ``folder`` for entries and create the empty output document."""
        self.folder = folder
        self.pdfs = []
        self._get_pdfs(folder)
        self.doc = pymupdf.open()

    def _get_pdfs(self, root_dir, level=1, isall=False):
        """Collect folder and PDF entries below ``root_dir`` into ``self.pdfs``.

        Args:
            root_dir: Directory to list.
            level: Outline level assigned to the entries of ``root_dir``.
            isall: When false, plain files in ``root_dir`` are skipped and only
                sub-folders are descended into (used for the top level).
        """
        def sort_key(filename):
            # Primary key: the numbers embedded in the name, compared
            # numerically.  The name itself breaks ties so the order does not
            # depend on the arbitrary order returned by os.listdir().
            numbers = tuple(int(part) for part in re.findall(r'\d+', filename))
            return numbers, filename

        for name in sorted(os.listdir(root_dir), key=sort_key):
            path = os.path.join(root_dir, name)
            if os.path.isdir(path):
                self.pdfs.append(
                    {'name': name, 'path': path, 'isdir': True, 'level': level}
                )
                self._get_pdfs(path, level + 1, True)
            elif isall:
                stem, ext = os.path.splitext(name)
                if ext.lower() == '.pdf':
                    self.pdfs.append(
                        {'name': stem, 'path': path, 'isdir': False, 'level': level}
                    )

    def _merge_pdfs(self):
        """Copy every page of every collected PDF onto a new A4 page.

        Each entry gets ``page_number``: the 1-based number (counted without
        the index pages) of the first page that belongs to it.  A folder entry
        shares the number of the first file that follows it.
        """
        total_pages = 1
        for item in self.pdfs:
            item['page_number'] = total_pages
            if item['isdir']:
                continue

            with pymupdf.open(item['path']) as mfile:
                total_pages += mfile.page_count
                for page in mfile:
                    # Turn landscape pages to portrait.  Page.rect already
                    # reflects the current rotation and set_rotation() takes an
                    # absolute angle, so rotate relative to the current value;
                    # otherwise a page that is landscape *because of* its
                    # rotation would stay unchanged.
                    if page.rect.width > page.rect.height:
                        page.set_rotation((page.rotation + 90) % 360)

                    # Reset the rotation to 0 while keeping the appearance.
                    if page.rotation > 0:
                        page.remove_rotation()

                    # new_page() defaults to A4; showing the source page in the
                    # full page rectangle scales it to A4.
                    a4p = self.doc.new_page()
                    a4p.show_pdf_page(a4p.rect, mfile, page.number)

    def _add_bookmarks(self, offset=0, is_toc=True):
        """Write the PDF outline for all collected entries.

        Args:
            offset: Number of pages in front of the merged content; added to
                every entry's ``page_number``.
            is_toc: When true, a first bookmark pointing at page 1 (the index)
                is added.
        """
        bookmarks = [[1, '目录', 1]] if is_toc else []
        bookmarks.extend(
            [item['level'], item['name'], item['page_number'] + offset]
            for item in self.pdfs
        )
        self.doc.set_toc(bookmarks)

    def _split_text_into_lines(self, text, font, font_size, max_width):
        """Wrap ``text`` into lines no wider than ``max_width``.

        Runs of cased letters and digits that start with a cased letter are
        kept together as one word; every other character (CJK characters,
        spaces, punctuation, a leading digit) may be followed by a line break.
        A word that is wider than ``max_width`` on its own is broken between
        characters.

        Args:
            text: The string to wrap.
            font: Object providing ``text_length(text, fontsize)``.
            font_size: Font size passed to ``font.text_length``.
            max_width: Maximum line width in the units of ``text_length``.

        Returns:
            The list of lines; empty for empty ``text``.
        """
        def is_word_char(ch):
            # CJK characters are alphabetic but have no case, so they end a
            # word instead of being glued onto it.
            return ch.islower() or ch.isupper() or ch.isdigit()

        # Tokenize into words and single characters.
        words = []
        i = 0
        length = len(text)
        while i < length:
            ch = text[i]
            if ch.islower() or ch.isupper():
                word_start = i
                while i < length and is_word_char(text[i]):
                    i += 1
                words.append(text[word_start:i])
            else:
                words.append(ch)
                i += 1

        # Greedily pack the tokens into lines.
        lines = []
        current_line = ""
        for word in words:
            if font.text_length(current_line + word, font_size) <= max_width:
                current_line += word
                continue

            if current_line:
                lines.append(current_line)
                current_line = ""

            if font.text_length(word, font_size) <= max_width:
                current_line = word
                continue

            # The word alone is too wide: break it character by character.
            for ch in word:
                too_wide = font.text_length(current_line + ch, font_size) > max_width
                if current_line and too_wide:
                    lines.append(current_line)
                    current_line = ch
                else:
                    current_line += ch

        if current_line:
            lines.append(current_line)

        return lines

    def _add_catalog_page(self):
        """Insert the printed index pages at the front of the document.

        Each entry is drawn as an indented title, a dotted leader and the
        entry's page number.  Long titles are wrapped.  A blank page is added
        when the index would otherwise have an odd number of pages, so that
        the content starts on a fresh sheet when printing double-sided.

        Returns:
            The number of pages inserted (index pages plus padding page).
        """
        page_index = 0
        width, height = pymupdf.paper_size("a4")
        toc_page = self.doc.new_page(page_index)

        font = pymupdf.Font("cjk")
        font_size = 12
        toc_page.insert_font(fontname="F0", fontbuffer=font.buffer)

        top_margin = 60
        bottom_margin = height - 60       # lowest y an entry line may use
        left_margin = 72
        right_margin = width - 72
        dot_space = 5                     # distance between leader dots
        different_title_spacing = 25      # distance between two entries
        same_title_line_spacing = 15      # distance between wrapped lines
        indent = 15                       # indentation per outline level
        split_ratio = 0.97                # share of the width a title may use
        available_width = right_margin - left_margin - dot_space * 2

        y_position = top_margin
        toc_page.insert_text((280, y_position), "目录", fontname="F0", fontsize=16)
        y_position += 30
        first_entry_y = y_position        # y of the first entry on this page

        for item in self.pdfs:
            prefix = indent * (item['level'] - 1)
            title = f"{item['name']}"
            page_number_str = str(item['page_number'])
            page_number_width = font.text_length(page_number_str, font_size)
            real_width = (available_width - prefix) * split_ratio

            if font.text_length(title, font_size) > real_width:
                lines = self._split_text_into_lines(title, font, font_size, real_width)
            else:
                lines = [title]

            # Start a new index page *before* drawing when the whole entry
            # (including wrapped lines) does not fit.  Checking up front avoids
            # a trailing empty page after the last entry.  An entry at the top
            # of a page is always drawn, however tall it is.
            entry_bottom = y_position + (len(lines) - 1) * same_title_line_spacing
            if entry_bottom > bottom_margin and y_position > first_entry_y:
                page_index += 1
                toc_page = self.doc.new_page(page_index)
                toc_page.insert_font(fontname="F0", fontbuffer=font.buffer)
                y_position = first_entry_y = top_margin

            x_title = left_margin + prefix
            for line in lines[:-1]:
                toc_page.insert_text((x_title, y_position), line, fontname="F0", fontsize=font_size)
                y_position += same_title_line_spacing

            # The last title line shares its baseline with leader and number.
            last_line = lines[-1]
            toc_page.insert_text((x_title, y_position), last_line, fontname="F0", fontsize=font_size)
            toc_page.insert_text((right_margin - page_number_width, y_position), page_number_str, fontname="F0", fontsize=font_size)

            # Dotted leader between the title and the page number.
            current_position = x_title + font.text_length(last_line, font_size) + 10
            dot_line_end = right_margin - page_number_width - 10
            while current_position < dot_line_end:
                toc_page.insert_text((current_position, y_position), ".", fontname="F0", fontsize=font_size)
                current_position += dot_space

            y_position += different_title_spacing

        # page_index is 0-based, so an even value means an odd page count.
        if page_index % 2 == 0:
            page_index += 1
            toc_page = self.doc.new_page(page_index)

        return page_index + 1

    def _int_to_roman(self, num):
        """Return ``num`` as an upper-case Roman numeral.

        Uses standard subtractive notation (4 -> ``IV``, 40 -> ``XL``).
        Returns an empty string for ``num <= 0``.
        """
        if num <= 0:
            return ''

        numerals = (
            (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
            (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
            (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"),
        )
        parts = []
        for value, symbol in numerals:
            count, num = divmod(num, value)
            parts.append(symbol * count)
        return ''.join(parts)

    def _add_page_numbers(self, numformat=None, start=1, end=None):
        """Stamp centered page numbers at the bottom of a range of pages.

        Numbering restarts at 1 on page ``start``.

        Args:
            numformat: Optional callable turning the integer page number into
                the text to print; plain decimal digits when ``None``.
            start: 1-based number of the first page to stamp.
            end: 1-based number of the last page to stamp; ``None`` means up
                to the last page of the document.
        """
        font_size = 12
        font = pymupdf.Font("tibo")  # only used to measure and name the font
        page_number = 0
        for idx, page in enumerate(self.doc, start=1):
            if end is not None and idx > end:
                break
            if idx < start:
                continue

            page_number += 1
            if numformat is not None:
                text = f'{numformat(page_number)}'
            else:
                text = f"{page_number}"

            page_number_width = font.text_length(text, font_size)
            x = (page.rect.width - page_number_width) / 2
            page.insert_text((x, page.rect.height - 20), text, fontsize=font_size, fontname=font.name, fill=(0, 0, 0))

    def save(self, output_pdf_path):
        """Build the merged document and write it to ``output_pdf_path``.

        Steps: merge the pages, insert the index, write the outline, number
        the index pages with Roman numerals and the content pages with
        decimal numbers starting at 1.  The document is closed afterwards,
        also when a step fails, so this method can be called only once per
        instance.
        """
        try:
            self._merge_pdfs()
            offset = self._add_catalog_page()
            self._add_bookmarks(offset)

            self._add_page_numbers(self._int_to_roman, end=offset)
            self._add_page_numbers(start=offset + 1)

            self.doc.save(output_pdf_path)
        finally:
            self.doc.close()


if __name__ == '__main__':
    # Script entry: merge the PDFs found in the sub-folders of the current
    # working directory into a single file written to that same directory.
    merger = MergePDF(os.getcwd())
    print('正在合并文件中……')
    merger.save('佐证材料.pdf')
    print('Ok')