使用 Python 将 pdf 文件转化为 txt 文档
安装依赖
pip install pdfminer==20140328
脚本
pdf2txt.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from io import BytesIO as StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
class Pdf2Txt:
    """Extract the text of a PDF file and save it to a text file.

    Attributes are plain paths; no file is touched until ``read_pdf`` or
    ``write_to_txt`` is called.
    """

    def __init__(self, pdf_path, txt_path):
        """Remember the input and output locations.

        Args:
            pdf_path: Path of the PDF file to read.
            txt_path: Path of the text file to write.
        """
        self.pdf_path = pdf_path
        self.txt_path = txt_path

    def read_pdf(self):
        """Run pdfminer over ``self.pdf_path`` and return the extracted text.

        Returns:
            The text of all processed pages as UTF-8 encoded bytes (the
            converter writes into an in-memory bytes buffer).

        Raises:
            IOError/OSError: If the PDF file cannot be opened.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # ``open`` instead of the Python 2-only ``file`` builtin; the
                # ``with`` block closes the handle even if parsing fails.
                with open(self.pdf_path, 'rb') as fp:
                    # Empty page set + maxpages=0: no page restriction is
                    # requested (see pdfminer's PDFPage.get_pages).
                    pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                device.close()
            return retstr.getvalue()
        finally:
            # Release the buffer on both the success and the error path.
            retstr.close()

    def write_to_txt(self, pdf_info):
        """Write the extracted text to ``self.txt_path``.

        Args:
            pdf_info: Content to write, normally the value returned by
                ``read_pdf``.
        """
        # On Python 3 ``read_pdf`` yields ``bytes``, which a text-mode file
        # rejects with TypeError, so switch to binary mode for it. On
        # Python 2 ``bytes is str`` and the original text mode is kept.
        is_py3_bytes = isinstance(pdf_info, bytes) and bytes is not str
        with open(self.txt_path, 'wb' if is_py3_bytes else 'w') as f:
            f.write(pdf_info)
def main():
    """Ask for the PDF path and the output name, then run the conversion."""
    # Python 2's input() evaluates the typed text as an expression, so a
    # bare path fails (and arbitrary code could run); raw_input() returns
    # the text as typed. Python 3 has no raw_input and its input() already
    # returns the raw text.
    try:
        read_line = raw_input  # noqa: F821 -- only defined on Python 2
    except NameError:
        read_line = input
    pdf_path = read_line('请输入PDF文件绝对路径:')
    txt_path = read_line('请输入保存的TXT文本名字:')
    pdf_txt = Pdf2Txt(pdf_path, txt_path)
    pdf_info = pdf_txt.read_pdf()
    pdf_txt.write_to_txt(pdf_info)


if __name__ == '__main__':
    main()
使用
python pdf2txt.py
然后按照提示，依次输入 PDF 文件的绝对路径和要保存的 TXT 文件名即可。