summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Daniel Behmer <behmer_d@yahoo.com> 2022-10-22 10:49:38 -0700
committer Daniel Behmer <behmer_d@yahoo.com> 2022-10-22 10:49:38 -0700
commit 0a086697e3266ce68bee1b3628d7e7de95620f15 (patch)
tree 4e40f155e862660df9b0d2e50bbb2dae0fd11058
parent 3fab0721a01002e6df67b7e5d999b1e306265202 (diff)
Started pdf converter
-rw-r--r-- FBGM.py 35
1 files changed, 28 insertions, 7 deletions
diff --git a/FBGM.py b/FBGM.py
index 349693e..7344207 100644
--- a/FBGM.py
+++ b/FBGM.py
@@ -1,12 +1,16 @@
from email import header
from ftplib import parse150
-import os, sys, docx
+import os, sys, docx, PyPDF2
from string import punctuation
+from PyPDF2 import PdfReader
+
+def remove_non_ascii(string):
+ return string.encode('ascii', errors='xmlcharrefreplace').decode()
def html_file_generator(path, page_header, page_contents):
filename = path[path.rfind('/')+1: path.rfind('.')]
- if os.path.exists(filename+".html") == True:
+ if os.path.exists(filename+".html") == True:
os.remove(filename+".html")
htmlfile = open(filename+".html", "x")
@@ -20,9 +24,12 @@ def html_file_generator(path, page_header, page_contents):
'</head>'
'<body>'
'<h1>' +page_header+ '</h1>')
- htmlfile.write(page_contents)
+ htmlfile.write(remove_non_ascii(page_contents))
htmlfile.write('</body>'
'</html>')
+ htmlfile.close()
+ return 0
+
#Text file handler
@@ -110,16 +117,30 @@ def txthandler(path):
print(f.read())
return 0
+#def header_footer_remover(page_text):
+ #same_header, same_footer = []
+ #for same_header
+ #return(page_text)
+
+#def html_file_generator(path, page_header, page_contents):
+
def pdfhandler(path):
- f = open(path)
- print(f.read())
+ reader = PdfReader(path)
+ number_of_pages = len(reader.pages)
+ page_text = []
+ #print(number_of_pages)
+ for x in range(number_of_pages -1):
+ page = reader.pages[x]
+ page_text.append(page.extract_text())
+ #print(page_text[4])
+ html_file_generator(path, "penis", paragraph_parser("".join(page_text)))
return 0
#Attempting to pass txt file in path
-#path = 'C:/Code/texttohtml/ThisOne.docx'
-path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx'
+path = 'C:/Code/texttohtml/kac.pdf'
+#path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx'
extension = path[path.rfind('.')+1: len(path)]