前面我们使用 pypdf 和 reportlab 库实现了 PDF 合并、添加目录、页脚和书签等功能,下面改用 PyMuPDF 来实现。
重点解决了前面文章 https://www.csfplus.com/post/py-merge-pdf.html 中存在的问题:
1)将所有横向页面切换成纵向页面;
2)将页面缩放成A4大小方便打印;
3)添加目录页,如果目录页数是奇数,则补一页空白页,方便双面打印;
4)文件合并根据现有的目录和文件结构进行组织;
5)文件根据文件名排序,自行命名。
PyMuPDF 文档:https://pymupdf.readthedocs.io/
import os
import re
import pymupdf
class MergePDF:
    """Merge the PDFs found in the sub-folders of a directory into one A4 document.

    The result gets a printed table of contents (padded to an even number of
    pages for duplex printing), PDF bookmarks that mirror the folder tree, and
    centred page numbers (Roman numerals for the contents pages, Arabic
    numerals for the merged content).

    Attributes:
        folder: Root directory that was scanned.
        pdfs: Flat, ordered list of entry dicts with the keys ``name``,
            ``path``, ``isdir`` and ``level``; ``_merge_pdfs`` adds
            ``page_number``.
        doc: The PyMuPDF output document being assembled.
    """

    def __init__(self, folder):
        """Scan *folder* for PDFs and create the empty output document."""
        self.folder = folder
        self.pdfs = []
        self._get_pdfs(folder)
        self.doc = pymupdf.open()

    def _get_pdfs(self, root_dir, level=1, isall=False):
        """Recursively collect folders and PDF files below *root_dir* into ``self.pdfs``.

        Args:
            root_dir: Directory to list.
            level: Nesting depth recorded for the entries of this directory
                (1 for the root's direct children).
            isall: When false, files directly inside *root_dir* are ignored
                and only sub-folders are recorded; recursive calls pass True.
        """
        digits = re.compile(r'\d+')
        names = os.listdir(root_dir)
        # Order by the numbers embedded in each name; fall back to the name
        # itself so that ties do not depend on the arbitrary listdir order.
        names.sort(key=lambda filename: (
            tuple(int(part) for part in digits.findall(filename)),
            filename,
        ))
        for name in names:
            path = os.path.join(root_dir, name)
            data = {'name': name, 'path': path, 'isdir': False, 'level': level}
            if os.path.isdir(path):
                data['isdir'] = True
                self.pdfs.append(data)
                self._get_pdfs(path, level + 1, True)
            elif isall:
                stem, ext = os.path.splitext(name)
                if ext.lower() == '.pdf':
                    data['name'] = stem  # entries are titled without the extension
                    self.pdfs.append(data)

    def _merge_pdfs(self):
        """Append every collected PDF to ``self.doc`` as portrait A4 pages.

        Each entry in ``self.pdfs`` receives ``page_number``: the 1-based
        number (counted within the merged content, before any contents pages
        are inserted) of the page on which the entry starts. A folder gets the
        number of the next page that will be appended.
        """
        next_page = 1
        for item in self.pdfs:
            item['page_number'] = next_page
            if item['isdir']:
                continue
            with pymupdf.open(item['path']) as src:
                next_page += src.page_count
                for page in src:
                    # Turn landscape pages to portrait. The quarter turn is
                    # added to the existing rotation: a fixed value of 90
                    # would leave a page that is landscape *because* it is
                    # already rotated by 90 unchanged.
                    if page.rect.width > page.rect.height:
                        page.set_rotation((page.rotation + 90) % 360)
                    # Reset the rotation to 0 before the page is embedded.
                    if page.rotation > 0:
                        page.remove_rotation()
                    # new_page() is called with its default page size;
                    # the source page is scaled into that page.
                    a4_page = self.doc.new_page()
                    # NOTE(review): show_pdf_page is believed to raise
                    # ValueError for a source page without any content
                    # stream; keep such a page as a blank A4 page so the
                    # page count stays correct.
                    if page.get_contents():
                        a4_page.show_pdf_page(a4_page.rect, src, page.number)

    def _add_bookmarks(self, offset=0, is_toc=True):
        """Write the PDF outline (bookmarks) for all collected entries.

        Args:
            offset: Number of pages inserted in front of the merged content;
                added to every entry's ``page_number``.
            is_toc: When true, a first-level bookmark pointing at page 1 is
                added for the contents page.
        """
        last_page = self.doc.page_count
        bookmarks = []
        if is_toc:
            bookmarks.append([1, '目录', 1])
        for item in self.pdfs:
            # A trailing folder without PDFs would point one page past the
            # end of the document, which set_toc may reject as out of range.
            target = min(item['page_number'] + offset, last_page)
            bookmarks.append([item['level'], item['name'], target])
        self.doc.set_toc(bookmarks)

    def _split_text_into_lines(self, text, font, font_size, max_width):
        """Wrap *text* into lines no wider than *max_width*.

        Runs of cased letters and digits (e.g. English words) are kept
        together; every other character, including CJK characters, may be
        followed by a line break. A run that is wider than a whole line is
        broken character by character.

        Args:
            text: The string to wrap.
            font: Object providing ``text_length(text, font_size)``.
            font_size: Font size passed to ``font.text_length``.
            max_width: Maximum line width, in the units of ``text_length``.

        Returns:
            The list of lines; empty for empty *text*.
        """
        def is_letter(ch):
            # Cased alphabetic characters only. Requiring isalpha() as well
            # keeps symbols such as the Roman numeral 'Ⅰ' (upper-case but not
            # alphabetic) from starting a word that could never be extended.
            return ch.isalpha() and (ch.islower() or ch.isupper())

        words = []
        i = 0
        length = len(text)
        while i < length:
            if is_letter(text[i]):
                word_start = i
                # The first character always satisfies this test, so the scan
                # is guaranteed to advance. CJK characters end the word.
                while i < length and (is_letter(text[i]) or text[i].isdigit()):
                    i += 1
                words.append(text[word_start:i])
            else:
                words.append(text[i])
                i += 1

        lines = []
        current_line = ""
        for word in words:
            if font.text_length(word, font_size) <= max_width:
                pieces = [word]
            else:
                pieces = list(word)  # too wide for any line: break inside the word
            for piece in pieces:
                if font.text_length(current_line + piece, font_size) <= max_width:
                    current_line += piece
                    continue
                if current_line:  # never emit an empty line
                    lines.append(current_line)
                current_line = piece
        if current_line:
            lines.append(current_line)
        return lines

    def _new_catalog_page(self, index, font):
        """Insert a contents page at *index* and register the contents font on it."""
        page = self.doc.new_page(index)
        page.insert_font(fontname="F0", fontbuffer=font.buffer)
        return page

    def _add_catalog_page(self):
        """Insert the printed table of contents at the front of ``self.doc``.

        One entry is drawn per item of ``self.pdfs``: the title indented by
        its level, a dotted leader and the entry's ``page_number``. Titles
        that are too wide are wrapped. If the contents occupy an odd number
        of pages, one blank page is appended to them so that the merged
        content starts on an odd page for duplex printing.

        Returns:
            The number of pages inserted in front of the merged content.
        """
        width, height = pymupdf.paper_size("a4")
        font = pymupdf.Font("cjk")
        font_size = 12
        top_margin = 60
        bottom_margin = height - 60  # lowest baseline allowed for an entry
        left_margin = 72
        right_margin = width - 72
        dot_space = 5  # distance between two leader dots
        different_title_spacing = 25  # line advance between two entries
        same_title_line_spacing = 15  # line advance inside a wrapped title
        indent = 15  # extra indentation per nesting level
        split_ratio = 0.97  # share of the available width a title may use
        available_width = right_margin - left_margin - dot_space * 2

        page_index = 0
        toc_page = self._new_catalog_page(page_index, font)
        y_position = top_margin
        toc_page.insert_text((280, y_position), "目录", fontname="F0", fontsize=16)
        y_position += 30  # leave room below the heading

        for item in self.pdfs:
            prefix = indent * (item['level'] - 1)
            title = f"{item['name']}"
            page_number_str = str(item['page_number'])
            page_number_width = font.text_length(page_number_str, font_size)
            max_title_width = (available_width - prefix) * split_ratio

            if font.text_length(title, font_size) > max_title_width:
                lines = self._split_text_into_lines(
                    title, font, font_size, max_title_width) or [title]
            else:
                lines = [title]

            # Break the page *before* drawing, based on where the entry's last
            # line would land. This keeps wrapped titles inside the margins
            # and avoids an empty contents page after the final entry.
            last_line_y = y_position + (len(lines) - 1) * same_title_line_spacing
            if last_line_y > bottom_margin and y_position > top_margin:
                page_index += 1
                toc_page = self._new_catalog_page(page_index, font)
                y_position = top_margin

            x_title = left_margin + prefix
            for line in lines[:-1]:
                toc_page.insert_text((x_title, y_position), line,
                                     fontname="F0", fontsize=font_size)
                y_position += same_title_line_spacing
            last_line = lines[-1]
            toc_page.insert_text((x_title, y_position), last_line,
                                 fontname="F0", fontsize=font_size)
            toc_page.insert_text((right_margin - page_number_width, y_position),
                                 page_number_str, fontname="F0", fontsize=font_size)

            # Dotted leader between the end of the title and the page number.
            dot_x = x_title + font.text_length(last_line, font_size) + 10
            dot_end = right_margin - page_number_width - 10
            while dot_x < dot_end:
                toc_page.insert_text((dot_x, y_position), ".",
                                     fontname="F0", fontsize=font_size)
                dot_x += dot_space
            y_position += different_title_spacing

        # page_index is 0-based: an even index means an odd number of contents
        # pages, so pad with one blank page for duplex printing.
        if page_index % 2 == 0:
            page_index += 1
            self.doc.new_page(page_index)
        return page_index + 1

    def _int_to_roman(self, num):
        """Return *num* as an upper-case Roman numeral ('' for values below 1)."""
        if num <= 0:
            return ''
        numerals = (
            (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
            (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
            (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"),
        )
        parts = []
        for value, symbol in numerals:
            count, num = divmod(num, value)
            parts.append(symbol * count)
        return ''.join(parts)

    def _add_page_numbers(self, numformat=None, start=1, end=None):
        """Print a centred page number at the bottom of a range of pages.

        Numbering restarts at 1 on page *start*.

        Args:
            numformat: Optional callable turning the running number into the
                text to print; plain decimal digits are used when omitted.
            start: 1-based number of the first page to label.
            end: 1-based number of the last page to label; ``None`` labels
                every page up to the end of the document.
        """
        font_size = 12
        font = pymupdf.Font("tibo")  # loop-invariant, so built only once
        page_number = 0
        for idx, page in enumerate(self.doc, start=1):
            if idx < start:
                continue
            page_number += 1
            if numformat is not None:
                text = f'{numformat(page_number)}'
            else:
                text = f"{page_number}"
            text_width = font.text_length(text, font_size)
            x = (page.rect.width - text_width) / 2
            page.insert_text((x, page.rect.height - 20), text, fontsize=font_size,
                             fontname=font.name, fill=(0, 0, 0))
            if end is not None and idx >= end:
                break

    def save(self, output_pdf_path):
        """Build the merged document and write it to *output_pdf_path*.

        The output document is closed afterwards, also when a step fails, so
        ``save`` can be called only once per instance.
        """
        try:
            self._merge_pdfs()
            offset = self._add_catalog_page()
            self._add_bookmarks(offset)
            # Contents pages are numbered I, II, ...; the content restarts at 1.
            self._add_page_numbers(self._int_to_roman, end=offset)
            self._add_page_numbers(start=offset + 1)
            self.doc.save(output_pdf_path)
        finally:
            self.doc.close()
if __name__ == '__main__':
    # Script entry point: merge the PDFs below the current working directory
    # and write the result into that same directory.
    source_dir = os.getcwd()
    merger = MergePDF(source_dir)
    print('正在合并文件中……')
    output_name = '佐证材料.pdf'
    merger.save(output_name)
    print('Ok')