Python - pdfminer/poppler - how to set the encoding
I have a file, e.g. http://www.agfl.cs.ru.nl/papers/manual28.pdf (it's in English).
pdfminer and poppler show the same result in the parsed pages, like: ¾º¿  ÒÙ Öݸ ¾¼¼ Ⱥ ¾º ÂÙÒ ¸ ¾¼¼ ź Ë ÙØØ Ö¸ Ǻ Ë
It seems they can't read the font's custom encodings. How do I specify the encoding?
Here are the code samples:
# NOTE: this is a Python 2 sample (urllib.pathname2url, basestring).
# It presumably relies on imports along these lines -- confirm against the
# installed library versions:
#   import os, urllib, poppler
#   from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
#   from pdfminer.converter import HTMLConverter, TextConverter
#   from pdfminer.layout import LAParams
#   from pdfminer.pdfpage import PDFPage

# --- poppler ---
input_filename = '/tmp/manual28.pdf'
# poppler is given a file:// URI here, so the absolute path is URL-quoted.
document = poppler.document_new_from_file(
    'file://%s' % urllib.pathname2url(os.path.abspath(input_filename)),
    None)
n_pages = document.get_n_pages()
for i in range(n_pages):
    page = document.get_page(i)
    print(page.get_text())
    # chardet.detect(page.get_text())  # reports utf8 every time


# --- pdfminer ---
def pdf_to_html(in_fp, out_fp, codec='utf-8', maxpages=0, pagenos=None,
                html=True):
    """Convert a PDF to HTML (or plain text) with pdfminer.

    Args:
        in_fp: Path of the PDF to read, or an already-open binary file
            object. A path is opened with mode ``'rb'``.
        out_fp: Path of the output file, or an already-open binary file
            object. A path is opened with mode ``'wb'``.
        codec: Codec name forwarded to the converter device.
        maxpages: Forwarded to ``PDFPage.get_pages`` as ``maxpages``.
        pagenos: Forwarded to ``PDFPage.get_pages`` as the page selection.
        html: When true an ``HTMLConverter`` is used, otherwise a
            ``TextConverter``.

    Both streams are closed before returning, including file objects that
    were supplied by the caller.
    """
    rsrcmgr = PDFResourceManager()
    layout_params = LAParams()
    if isinstance(in_fp, basestring):
        in_fp = open(in_fp, 'rb')
    if isinstance(out_fp, basestring):
        out_fp = open(out_fp, 'wb')
    if html:
        device = HTMLConverter(rsrcmgr, out_fp, codec=codec,
                               laparams=layout_params)
    else:
        device = TextConverter(rsrcmgr, out_fp, codec=codec,
                               laparams=layout_params)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(in_fp, pagenos, maxpages=maxpages):
            interpreter.process_page(page)
    finally:
        # Release the streams even when a page fails to parse; the close
        # order matches the original sample (input, device, output).
        in_fp.close()
        device.close()
        out_fp.close()
Comments
Post a Comment