前面使用pypdf和reportlab库来进行PDF合并、添加目录、页脚和书签等功能。下面我们将使用PyMuPDF来进行操作。
重点解决了前面文章 https://www.csfplus.com/post/py-merge-pdf.html 中存在的问题:
1)将所有横向页面切换成纵向页面;
2)将页面缩放成A4大小方便打印;
3)添加目录页,如果目录页数为奇数,则追加一页空白页,方便双面打印;
4)文件合并根据现有的目录和文件结构进行组织;
5)文件按文件名中的数字排序,文件名可自行命名。
PyMuPDF 文档:https://pymupdf.readthedocs.io/
import os
import re

import pymupdf


class MergePDF:
    """Merge the PDFs found below a folder into one A4 document.

    The merged document gets a printed catalog (table of contents) in
    front, PDF bookmarks mirroring the folder structure, Roman page
    numbers on the catalog pages and Arabic page numbers on the content.
    """

    def __init__(self, folder):
        """Collect the PDFs below *folder* and create the empty output document.

        Args:
            folder: Root directory whose sub-directories are scanned.
        """
        self.folder = folder
        # Flat, ordered list of dicts with the keys
        # 'name', 'path', 'isdir', 'level' (and later 'page_number').
        self.pdfs = []
        self._get_pdfs(folder)
        self.doc = pymupdf.open()

    @staticmethod
    def _natural_key(filename):
        """Return a sort key: the integers in *filename*, then the name itself.

        The name is a tiebreaker so that entries with identical (or no)
        digits are ordered deterministically instead of in whatever order
        ``os.listdir`` happened to return them.
        """
        numbers = tuple(int(part) for part in re.findall(r'\d+', filename))
        return numbers, filename

    def _get_pdfs(self, root_dir, level=1, isall=False):
        """Recursively append directory and PDF entries to ``self.pdfs``.

        Args:
            root_dir: Directory to list.
            level: Outline level recorded for the entries of *root_dir*.
            isall: When False, plain files directly in *root_dir* are
                ignored and only its sub-directories are descended into.
                The recursive calls always pass True.
        """
        for name in sorted(os.listdir(root_dir), key=self._natural_key):
            path = os.path.join(root_dir, name)
            if os.path.isdir(path):
                self.pdfs.append(
                    {'name': name, 'path': path, 'isdir': True, 'level': level}
                )
                self._get_pdfs(path, level + 1, True)
            elif isall:
                stem, ext = os.path.splitext(name)
                if ext.lower() == '.pdf':
                    self.pdfs.append(
                        {'name': stem, 'path': path, 'isdir': False, 'level': level}
                    )

    def _merge_pdfs(self):
        """Copy every page of every collected PDF onto a new A4 portrait page.

        Also stores in each entry's 'page_number' the 1-based number of the
        first merged page belonging to it (for a directory: the first page
        of whatever follows it).
        """
        total_pages = 1
        for item in self.pdfs:
            item['page_number'] = total_pages
            if item['isdir']:
                continue
            with pymupdf.open(item['path']) as src:
                total_pages += src.page_count
                for page in src:
                    # Turn landscape pages to portrait. The rotation must be
                    # relative: page.rect already reflects the page's own
                    # rotation, so a page that is landscape *because* of
                    # /Rotate 90 would be left unchanged by set_rotation(90).
                    if page.rect.width > page.rect.height:
                        page.set_rotation((page.rotation + 90) % 360)
                    # Reset the rotation to 0 while keeping the appearance.
                    if page.rotation > 0:
                        page.remove_rotation()
                    # new_page() defaults to A4; the source page is scaled
                    # into it.
                    a4_page = self.doc.new_page()
                    # NOTE(review): show_pdf_page appears to reject source
                    # pages that have no content stream - confirm against
                    # the installed PyMuPDF version. Such pages are kept as
                    # blank A4 pages so the page count stays correct.
                    if page.get_contents():
                        a4_page.show_pdf_page(a4_page.rect, src, page.number)

    def _add_bookmarks(self, offset=0, is_toc=True):
        """Write the PDF outline (bookmarks) of the merged document.

        Args:
            offset: Number of pages inserted in front of the merged content;
                added to every entry's page number.
            is_toc: When True, a first bookmark pointing at page 1 (the
                catalog) is added.
        """
        bookmarks = [[1, '目录', 1]] if is_toc else []
        bookmarks.extend(
            [item['level'], item['name'], item['page_number'] + offset]
            for item in self.pdfs
        )
        self.doc.set_toc(bookmarks)

    @staticmethod
    def _tokenize(text):
        """Split *text* into unbreakable tokens.

        A run of cased letters and digits that starts with a cased letter
        is one token (a Latin-style word); every other character, e.g. a
        CJK character, a space or punctuation, is a token of its own.
        """
        def is_letter(char):
            return char.islower() or char.isupper()

        tokens = []
        i = 0
        while i < len(text):
            if is_letter(text[i]):
                start = i
                # Uncased characters (CJK) end the word, so a word can no
                # longer drag the CJK text that follows it onto one token.
                while i < len(text) and (is_letter(text[i]) or text[i].isdigit()):
                    i += 1
                tokens.append(text[start:i])
            else:
                tokens.append(text[i])
                i += 1
        return tokens

    def _split_text_into_lines(self, text, font, font_size, max_width):
        """Greedily wrap *text* into lines no wider than *max_width*.

        Args:
            text: The text to wrap.
            font: Object providing ``text_length(text, font_size)``.
            font_size: Font size used for measuring.
            max_width: Maximum line width, in the unit of ``text_length``.

        Returns:
            The list of lines. A single token wider than *max_width* is
            placed on a line of its own; no empty lines are produced.
        """
        lines = []
        current_line = ""
        for word in self._tokenize(text):
            if font.text_length(current_line + word, font_size) <= max_width:
                current_line += word
                continue
            # Flush only a non-empty line; otherwise an over-long first
            # token would leave an empty line behind.
            if current_line:
                lines.append(current_line)
            current_line = word
        if current_line:
            lines.append(current_line)
        return lines

    def _new_catalog_page(self, page_index, font):
        """Insert a catalog page at *page_index* with the catalog font registered as 'F0'."""
        page = self.doc.new_page(page_index)
        page.insert_font(fontname="F0", fontbuffer=font.buffer)
        return page

    def _add_catalog_page(self):
        """Insert the printed catalog in front of the merged pages.

        Each entry shows its (indented, possibly wrapped) title, a dotted
        leader and its page number. If the catalog needs an odd number of
        pages, one blank page is appended to it for double-sided printing.

        Returns:
            The number of pages inserted in front of the merged content.
        """
        width, height = pymupdf.paper_size("a4")
        font = pymupdf.Font("cjk")
        font_size = 12
        top_margin = 60
        bottom_margin = height - 60
        left_margin = 72
        right_margin = width - 72
        dot_space = 5                   # distance between two leader dots
        different_title_spacing = 25    # line advance between two entries
        same_title_line_spacing = 15    # line advance inside a wrapped title
        indent = 15                     # indentation per outline level
        split_ratio = 0.97              # share of the width a title may use
        available_width = right_margin - left_margin - dot_space * 2

        page_index = 0
        toc_page = self._new_catalog_page(page_index, font)
        toc_page.insert_text((280, top_margin), "目录", fontname="F0", fontsize=16)
        y_position = top_margin + 30

        for item in self.pdfs:
            prefix = indent * (item['level'] - 1)
            title = f"{item['name']}"
            page_number_str = str(item['page_number'])
            page_number_width = font.text_length(page_number_str, font_size)
            real_width = (available_width - prefix) * split_ratio

            lines = [title]
            if font.text_length(title, font_size) > real_width:
                lines = self._split_text_into_lines(
                    title, font, font_size, real_width
                ) or [title]

            for idx, line in enumerate(lines, start=1):
                # Break the page *before* drawing, so that wrapped titles
                # cannot run below the margin and no empty catalog page is
                # created after the last entry.
                if y_position > bottom_margin:
                    page_index += 1
                    toc_page = self._new_catalog_page(page_index, font)
                    y_position = top_margin
                toc_page.insert_text(
                    (left_margin + prefix, y_position),
                    line, fontname="F0", fontsize=font_size,
                )
                if idx < len(lines):
                    y_position += same_title_line_spacing

            # The page number and the dotted leader go on the last title line.
            toc_page.insert_text(
                (right_margin - page_number_width, y_position),
                page_number_str, fontname="F0", fontsize=font_size,
            )
            current_position = (
                left_margin + prefix + font.text_length(lines[-1], font_size) + 10
            )
            dot_line_end = right_margin - page_number_width - 10
            while current_position < dot_line_end:
                toc_page.insert_text(
                    (current_position, y_position),
                    ".", fontname="F0", fontsize=font_size,
                )
                current_position += dot_space

            y_position += different_title_spacing

        # page_index is 0-based: an even index means an odd page count.
        if page_index % 2 == 0:
            page_index += 1
            self.doc.new_page(page_index)
        return page_index + 1

    def _int_to_roman(self, num):
        """Return *num* as an upper-case Roman numeral ('' for num <= 0)."""
        if num <= 0:
            return ''
        numerals = (
            (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
            (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
            (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"),
        )
        parts = []
        for value, symbol in numerals:
            count, num = divmod(num, value)
            parts.append(symbol * count)
        return "".join(parts)

    def _add_page_numbers(self, numformat=None, start=1, end=None):
        """Print centred page numbers at the bottom of a range of pages.

        Numbering restarts at 1 on page *start*.

        Args:
            numformat: Optional callable turning the running number into
                the text to print; the plain decimal number when None.
            start: 1-based number of the first page to label.
            end: 1-based number of the last page to label; through the end
                of the document when None.
        """
        font_size = 12
        font = pymupdf.Font("tibo")  # built once instead of once per page
        page_number = 0
        for idx, page in enumerate(self.doc, start=1):
            if idx < start:
                continue
            page_number += 1
            if numformat is not None:
                text = f'{numformat(page_number)}'
            else:
                text = f"{page_number}"
            x = (page.rect.width - font.text_length(text, font_size)) / 2
            page.insert_text(
                (x, page.rect.height - 20), text,
                fontsize=font_size, fontname=font.name, fill=(0, 0, 0),
            )
            if end is not None and idx >= end:
                break

    def save(self, output_pdf_path):
        """Build the merged document and write it to *output_pdf_path*.

        The document is closed afterwards, even when a step fails, so an
        instance can be saved only once.
        """
        try:
            self._merge_pdfs()
            offset = self._add_catalog_page()
            self._add_bookmarks(offset)
            # Roman numerals on the catalog pages, Arabic on the content.
            self._add_page_numbers(self._int_to_roman, end=offset)
            self._add_page_numbers(start=offset + 1)
            self.doc.save(output_pdf_path)
        finally:
            self.doc.close()


if __name__ == '__main__':
    pdf = MergePDF(os.getcwd())
    print('正在合并文件中……')
    pdf.save('佐证材料.pdf')
    print('Ok')