You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

111 lines
3.6 KiB

2 years ago
import platform
import os
import fitz # pip install PyMuPDF
from PIL import Image
import shutil
import logging
def word2pdf(word_file):
    """Convert a Word document to PDF through the Word COM automation API.

    Requires ``pywin32`` and a ``Word.Application`` COM server, so this is
    the Windows code path.

    Args:
        word_file: Path of the Word document to convert.

    Returns:
        Path of the PDF: ``word_file`` with its extension replaced by ``.pdf``.
    """
    from win32com import client  # pip install pywin32

    # Swap only the trailing extension. A plain str.replace() on the extension
    # text would also rewrite matching text elsewhere in the path and would
    # pick the wrong token for names that contain several dots.
    pdf_file = os.path.splitext(word_file)[0] + '.pdf'

    # Obtain the Word automation object.
    word = client.Dispatch('Word.Application')
    try:
        # Open the file as a document object.
        doc_ = word.Documents.Open(word_file)
        try:
            # FileFormat=17 selects PDF output (wdFormatPDF).
            doc_.SaveAs(pdf_file, FileFormat=17)
        finally:
            # Always release the document, even when saving fails.
            doc_.Close()
    finally:
        # Always quit Word so a failed conversion does not leave it running.
        word.Quit()

    logging.info(f'{word_file} ----转pdf成功')
    return pdf_file
def word2pdf_linux(word_file):
    """Convert a Word document to PDF with headless LibreOffice.

    Args:
        word_file: Path of the Word document to convert.

    Returns:
        Path of the generated PDF, written next to ``word_file``.

    Raises:
        subprocess.CalledProcessError: If ``libreoffice`` exits with a
            non-zero status.
        FileNotFoundError: If the ``libreoffice`` executable cannot be found,
            or if the run finished without producing the expected PDF.
    """
    import subprocess

    # An empty dirname (bare file name) means the current directory.
    out_dir = os.path.dirname(word_file) or '.'
    # Swap only the trailing extension; str.replace() on the extension text
    # would also rewrite matching text elsewhere in the path.
    pdf_file = os.path.splitext(word_file)[0] + '.pdf'

    # Pass an argument list instead of a shell string so that paths with
    # spaces or shell metacharacters are handed over verbatim.
    subprocess.run(
        [
            'libreoffice',
            '--headless',
            '--language=zh-CN',
            '--convert-to',
            'pdf',
            word_file,
            '--outdir',
            out_dir,
        ],
        check=True,
    )

    # Do not rely on the exit status alone: guard against a run that ends
    # cleanly without writing the output file.
    if not os.path.isfile(pdf_file):
        raise FileNotFoundError(f'libreoffice did not produce {pdf_file}')

    logging.info(f'{word_file} ----转pdf成功')
    return pdf_file
def pdf2png(pdf_file):
    """Render every page of a PDF and stack the pages into one tall PNG.

    Pages are rendered straight into in-memory images, so no temporary
    directory is created next to the PDF.

    Args:
        pdf_file: Path of the PDF to render.

    Returns:
        Path of the PNG: ``pdf_file`` with its extension replaced by ``.png``.

    Raises:
        OSError: If an I/O error occurs while rendering or saving; the
            failure is logged and the exception re-raised.
    """
    # Per-axis zoom factor. Per the original author's note, the unscaled page
    # is 792x612; 1.33333333 yields 1056x816 and 2 would yield 1584x1224.
    zoom = 1.33333333
    # Swap only the trailing extension; str.replace() on the extension text
    # would also rewrite matching text elsewhere in the path.
    png_file = os.path.splitext(pdf_file)[0] + '.png'

    try:
        matrix = fitz.Matrix(zoom, zoom)
        pages = []
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_file) as pdf_doc:
            for page in pdf_doc:
                pix = page.get_pixmap(matrix=matrix, alpha=False)
                # alpha=False gives 3 bytes per pixel, which maps onto a
                # Pillow "RGB" image without going through a file.
                pages.append(
                    Image.frombytes('RGB', (pix.width, pix.height), pix.samples)
                )

        # Canvas is as wide as the widest page and as tall as all pages.
        width = max((img.width for img in pages), default=0)
        height = sum(img.height for img in pages)
        long_image = Image.new('RGB', (width, height), (255, 255, 255))

        # Paste the pages top to bottom.
        y_offset = 0
        for img in pages:
            long_image.paste(img, (0, y_offset))
            y_offset += img.height

        long_image.save(png_file)
    except OSError:
        logging.error('pdf转png失败')
        raise

    logging.info("pdf转png成功")
    return png_file
def word_to_long_image(word_file_path):
    """Convert a Word document into a single long PNG image.

    The document is first converted to an intermediate PDF (through Word on
    Windows, through LibreOffice elsewhere), the PDF is rendered to a PNG,
    and the intermediate PDF is then deleted.

    Args:
        word_file_path: Path of the Word document; it is made absolute
            before conversion.

    Returns:
        Path of the generated PNG, as returned by ``pdf2png``.

    Raises:
        Exception: Any failure during conversion is logged and re-raised.
    """
    try:
        _file = os.path.abspath(word_file_path)
        if platform.system().lower() == 'windows':
            pdf_file = word2pdf(_file)
        else:
            pdf_file = word2pdf_linux(_file)

        try:
            return pdf2png(pdf_file)
        finally:
            # Remove the intermediate PDF on failure as well as on success.
            # A missing file is ignored so that it cannot mask the real
            # error raised by the rendering step.
            try:
                os.remove(pdf_file)
            except FileNotFoundError:
                pass
    except Exception as error:
        logging.error('word转长图出错:{}'.format(error))
        raise