diff options
author | Daniel Behmer <behmer_d@yahoo.com> | 2022-10-22 10:49:38 -0700 |
---|---|---|
committer | Daniel Behmer <behmer_d@yahoo.com> | 2022-10-22 10:49:38 -0700 |
commit | 0a086697e3266ce68bee1b3628d7e7de95620f15 (patch) | |
tree | 4e40f155e862660df9b0d2e50bbb2dae0fd11058 | |
parent | 3fab0721a01002e6df67b7e5d999b1e306265202 (diff) |
Started pdf converter
-rw-r--r-- | FBGM.py | 35 |
1 files changed, 28 insertions, 7 deletions
```diff
@@ -1,12 +1,16 @@
 from email import header
 from ftplib import parse150
-import os, sys, docx
+import os, sys, docx, PyPDF2
 from string import punctuation
+from PyPDF2 import PdfReader
+
+def remove_non_ascii(string):
+    return string.encode('ascii', errors='xmlcharrefreplace').decode()
 
 def html_file_generator(path, page_header, page_contents):
     filename = path[path.rfind('/')+1: path.rfind('.')]
 
-    if os.path.exists(filename+".html") == True:
+    if os.path.exists(filename+".html") == True:
         os.remove(filename+".html")
 
     htmlfile = open(filename+".html", "x")
@@ -20,9 +24,12 @@ def html_file_generator(path, page_header, page_contents):
     '</head>'
     '<body>'
     '<h1>' +page_header+ '</h1>')
-    htmlfile.write(page_contents)
+    htmlfile.write(remove_non_ascii(page_contents))
     htmlfile.write('</body>'
     '</html>')
+    htmlfile.close()
+    return 0
+
 
 #Text file handler
 
@@ -110,16 +117,30 @@ def txthandler(path):
     print(f.read())
     return 0
 
+#def header_footer_remover(page_text):
+    #same_header, same_footer = []
+    #for same_header
+    #return(page_text)
+
+#def html_file_generator(path, page_header, page_contents):
+
 
 def pdfhandler(path):
-    f = open(path)
-    print(f.read())
+    reader = PdfReader(path)
+    number_of_pages = len(reader.pages)
+    page_text = []
+    #print(number_of_pages)
+    for x in range(number_of_pages -1):
+        page = reader.pages[x]
+        page_text.append(page.extract_text())
+    #print(page_text[4])
+    html_file_generator(path, "penis", paragraph_parser("".join(page_text)))
     return 0
 
 
 #Attempting to pass txt file in path
-#path = 'C:/Code/texttohtml/ThisOne.docx'
-path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx'
+path = 'C:/Code/texttohtml/kac.pdf'
+#path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx'
 
 
 extension = path[path.rfind('.')+1: len(path)]
```