# Forked from Dawn_Ocean/ZJUEVA-Reimburse
# 119 lines, 4.2 KiB, Python
from docx import Document
|
||
from docx.shared import Inches
|
||
|
||
from PIL import Image
|
||
|
||
from copy import copy
|
||
|
||
import fitz
|
||
|
||
from docx2pdf import convert
|
||
|
||
import os
|
||
|
||
def convert_all(path = os.path.abspath('.')):
    """Convert the source files in every sub-folder of ``path`` to PNG.

    Each immediate sub-directory of ``path`` is handed to ``convert_img``.
    Processing stops at the first sub-folder that reports a failure.

    Args:
        path: Directory whose sub-folders are scanned. The default is the
            working directory captured when this function was defined.

    Returns:
        None. Progress and failures are only reported on stdout.
    """
    print("正在扫描并处理文件中...")
    # Status code returned by convert_img -> message printed before aborting.
    failure_messages = {
        1: "PDF 转换失败!",
        2: "JPG/JPEG 转换失败!",
    }
    for item in os.listdir(path):
        # Build the full path before testing it: a bare entry name would be
        # resolved against the current working directory, not ``path``.
        item_path = os.path.join(path, item)
        if not os.path.isdir(item_path):
            continue
        result = convert_img(item_path)
        if result in failure_messages:
            print(failure_messages[result])
            return
    print("转换完毕!")
|
||
|
||
"""传入绝对路径"""
|
||
def convert_img(path = os.path.abspath('.')):
    """Convert every PDF and JPG/JPEG file directly inside ``path`` to PNG.

    PDFs are delegated to ``pdf2img``; JPEG images are re-saved next to the
    original under the same base name with a ``.png`` extension. Files with
    any other extension are left alone.

    Args:
        path: Absolute path of the folder to process.

    Returns:
        1 if a PDF could not be converted, 2 if a JPG/JPEG could not be
        converted, otherwise None.
    """
    for file in os.listdir(path):
        # splitext keeps dots inside the base name ("a.b.jpg" -> "a.b").
        stem, ext = os.path.splitext(file)
        ext = ext.lower()  # also accept upper-case extensions such as .PDF
        if ext == '.pdf':
            if pdf2img(path, file):  # truthy result -> error
                return 1
        elif ext in ('.jpg', '.jpeg'):
            try:
                # The context manager releases the image file handle.
                with Image.open(os.path.join(path, file)) as img:
                    img.save(os.path.join(path, stem + ".png"), "PNG")
            except (OSError, ValueError):
                # Report the failure with the status code the caller
                # (convert_all) maps to its JPG/JPEG error message.
                return 2
|
||
|
||
|
||
def gen_filelist(path = os.path.abspath('.')):
    """Collect the PNG files of every sub-folder of ``path``.

    Every immediate sub-directory of ``path`` must contain exactly three
    PNG files; files with other extensions are ignored. A summary of what
    was found is printed to stdout.

    Args:
        path: Directory whose sub-folders are inspected.

    Returns:
        A dict mapping each sub-folder name to the list of PNG file names
        inside it. An empty dict is returned (after printing the offending
        folder) when a sub-folder does not hold exactly three PNG files, so
        callers can treat "nothing usable" uniformly.
    """
    print("创建文件夹列表中...")
    file_list = {}
    for item in os.listdir(path):
        # Resolve against ``path`` rather than the working directory.
        folder = os.path.join(path, item)
        if not os.path.isdir(folder):
            continue
        # Match on the extension only, so "x.png.bak" is not counted.
        png_names = [
            name for name in os.listdir(folder)
            if name.lower().endswith(".png")
        ]
        if len(png_names) != 3:
            print(f"在{item}文件夹发现错误:文件个数不符")
            return {}
        file_list[item] = png_names

    print("创建完毕!将要加入文档的文件如下:")
    for folder_name, png_names in file_list.items():
        print('- ' + folder_name)
        print(' ', end = '')
        for name in png_names:
            print(name, end = ", ")
        print()
    print(f"共有 {len(file_list)} 组文件")
    return file_list
|
||
|
||
def gen_docx(path = os.path.abspath('.')):
    """Build ``output.docx`` with one page per sub-folder of ``path``.

    For each folder reported by ``gen_filelist`` the picture whose name
    starts with "pdf" is placed at the top of the page and the remaining
    two pictures are placed side by side in a one-row table below it.
    If no usable file list is available, a hint is printed and the scan is
    repeated; the loop ends once a document has been saved.

    Args:
        path: Directory containing the sub-folders with the PNG files.

    Returns:
        None. The document is written to ``output.docx`` in the current
        working directory.
    """
    while True:
        doc = Document()
        file_list = gen_filelist(path)
        input("按回车键确认...")
        print("生成 .docx 文档中...")
        # Covers both an empty dict and a None result from gen_filelist.
        if not file_list:
            print("请检查文件命名是否正确!")
            continue

        last_index = len(file_list) - 1
        for index, (folder, names) in enumerate(file_list.items()):
            # Stable sort: names starting with "pdf" move to the front and
            # the remaining names keep their relative order.
            ordered = sorted(names, key=lambda name: not name.startswith("pdf"))
            top_picture = os.path.join(path, folder, ordered[0])
            left_picture = os.path.join(path, folder, ordered[1])
            right_picture = os.path.join(path, folder, ordered[2])

            doc.add_picture(top_picture, height = Inches(2.5))
            table = doc.add_table(rows = 1, cols = 2)
            left_cell = table.cell(0, 0)
            left_cell.paragraphs[0].add_run().add_picture(left_picture, height = Inches(5.0))
            right_cell = table.cell(0, 1)
            right_cell.paragraphs[0].add_run().add_picture(right_picture, height = Inches(5.0))

            # One folder per page; no trailing blank page after the last.
            if index != last_index:
                doc.add_page_break()

        doc.save("output.docx")
        print("生成完毕!")
        break
|
||
|
||
def pdf2img(pdf_path, pdf_name, zoom_x = 3, zoom_y = 3):
    """Render a single-page PDF to a PNG file in the same folder.

    The image is saved as ``"pdf" + <base name> + ".png"`` inside
    ``pdf_path``.

    Args:
        pdf_path: Folder that contains the PDF file.
        pdf_name: File name of the PDF inside ``pdf_path``.
        zoom_x: Horizontal scale factor passed to ``fitz.Matrix``.
        zoom_y: Vertical scale factor passed to ``fitz.Matrix``.

    Returns:
        1 if the PDF does not contain exactly one page, otherwise None.
    """
    doc = fitz.open(os.path.join(pdf_path, pdf_name))  # open the document
    try:
        if len(doc) != 1:
            print("PDF 文件只能包含一页!")
            return 1
        base_name = os.path.splitext(pdf_name)[0]
        for page in doc:  # exactly one page at this point
            # Render the page to a pixmap at the requested zoom.
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom_x, zoom_y))
            # Store the rendered page as a PNG next to the source PDF.
            pix.save(os.path.join(pdf_path, "pdf" + base_name + ".png"))
    finally:
        # Close the document on every path, including the early return.
        doc.close()
|
||
|
||
if __name__ == "__main__":
    # Show the usage notes, then wait for the user before touching files.
    for notice in (
        "在使用该脚本前,请保证程序所在的文件夹中仅包含程序、发票文件夹",
        "并且确保每个发票文件夹内只有三个文件:发票为.pdf文件,其他为图片",
    ):
        print(notice)
    input("按回车键开始...")

    # Step 1: turn PDFs/JPEGs into PNGs. Step 2: assemble the .docx file.
    convert_all()
    gen_docx()

    # Step 3: export the generated document to PDF.
    docx_name, pdf_name = "output.docx", "output.pdf"
    print("生成 .pdf 文件中...")
    convert(docx_name, pdf_name)
    print("生成完毕!")
|
||
|