Python pdfminer 将 PDF 文件转化为 TXT 文档

发布时间: 更新时间: 总字数:233 阅读时间:1m 作者: IP上海 分享 网址

使用 Python 将 pdf 文件转化为 txt 文档

安装依赖（注意：pdfminer 20140328 仅支持 Python 2；Python 3 环境请改用 pdfminer.six）

pip install pdfminer==20140328

脚本

pdf2txt.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from io import BytesIO as StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter


class Pdf2Txt:
    """Extract the text of a PDF file with pdfminer and save it to a file.

    Typical use is ``read_pdf()`` followed by ``write_to_txt()`` with the
    value that ``read_pdf()`` returned.
    """

    def __init__(self, pdf_path, txt_path):
        """Remember the input and output locations.

        Args:
            pdf_path: Path of the PDF file that ``read_pdf`` opens.
            txt_path: Path of the text file that ``write_to_txt`` creates.
        """
        self.pdf_path = pdf_path
        self.txt_path = txt_path

    def read_pdf(self):
        """Run every page of the PDF through a ``TextConverter``.

        Returns:
            The contents of the in-memory output buffer. ``StringIO`` is
            this module's alias for ``io.BytesIO``, so the value is a byte
            string; the converter is configured with the UTF-8 codec.

        Raises:
            IOError/OSError: If ``pdf_path`` cannot be opened.
            Any exception raised by pdfminer while parsing is propagated
            after the converter and the buffer have been closed.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # ``open`` replaces the Python 2-only ``file`` builtin, and
                # the ``with`` block closes the handle even when parsing
                # fails part-way through.
                with open(self.pdf_path, 'rb') as fp:
                    # An empty page-number set and maxpages=0 are passed to
                    # request all pages (pdfminer treats falsy values as
                    # "no restriction" -- see PDFPage.get_pages).
                    pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                # Close the converter before reading the buffer, matching
                # the original call order.
                device.close()
            return retstr.getvalue()
        finally:
            retstr.close()

    def write_to_txt(self, pdf_info):
        """Write extracted text to ``txt_path``, replacing any existing file.

        Args:
            pdf_info: The text to save. Byte strings (what ``read_pdf``
                returns) are written unchanged in binary mode; text strings
                are encoded as UTF-8.
        """
        if isinstance(pdf_info, bytes):
            # Text mode would raise TypeError for bytes on Python 3.
            with open(self.txt_path, 'wb') as f:
                f.write(pdf_info)
        else:
            # Local import: the module only imports BytesIO from io.
            import io
            with io.open(self.txt_path, 'w', encoding='utf-8') as f:
                f.write(pdf_info)


def main():
    """Prompt for the PDF and TXT paths, then convert the PDF to text.

    The two paths are read from standard input, the PDF is parsed with
    ``Pdf2Txt.read_pdf`` and the result is saved with
    ``Pdf2Txt.write_to_txt``.
    """
    # On Python 2 the builtin ``input`` evaluates what the user types as an
    # expression, so a plain path fails (or runs arbitrary code). Use
    # ``raw_input`` when it exists and fall back to Python 3's ``input``.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # Strip stray whitespace (e.g. a trailing space from copy/paste) that
    # would otherwise become part of the file name.
    pdf_path = read_line('请输入PDF文件绝对路径:').strip()
    txt_path = read_line('请输入保存的TXT文本名字:').strip()

    converter = Pdf2Txt(pdf_path, txt_path)
    converter.write_to_txt(converter.read_pdf())


# Run the interactive conversion only when executed as a script.
if __name__ == '__main__':
    main()

使用

python pdf2txt.py

然后按照提示，依次输入 PDF 文件的绝对路径和要保存的 TXT 文件名即可。

Home Archives Categories Tags Statistics
本文总阅读量 次 本站总访问量 次 本站总访客数