summary | refs | log | tree | commit | diff
path: root/FBGM.py
diff options
context:
space:
mode:
Diffstat (limited to 'FBGM.py')
-rw-r--r-- FBGM.py 13
1 files changed, 9 insertions, 4 deletions
diff --git a/FBGM.py b/FBGM.py
index 7344207..3a37c7c 100644
--- a/FBGM.py
+++ b/FBGM.py
@@ -4,8 +4,14 @@ import os, sys, docx, PyPDF2
from string import punctuation
from PyPDF2 import PdfReader
+def process_text(string):
+ string = remove_non_ascii(string)
+ string = remove_line_break(string)
+ return string
def remove_non_ascii(string):
return string.encode('ascii', errors='xmlcharrefreplace').decode()
+def remove_line_break(string):
+ return string.replace("- ","")
def html_file_generator(path, page_header, page_contents):
filename = path[path.rfind('/')+1: path.rfind('.')]
@@ -24,7 +30,7 @@ def html_file_generator(path, page_header, page_contents):
'</head>'
'<body>'
'<h1>' +page_header+ '</h1>')
- htmlfile.write(remove_non_ascii(page_contents))
+ htmlfile.write(process_text(page_contents))
htmlfile.write('</body>'
'</html>')
htmlfile.close()
@@ -99,7 +105,6 @@ def dochandler(path):
elif para.style.name in paragraph_style:
fullText+="<p>"+para.text
fullText+="</p>\n"
- print(para.text, para.style.name)
html_file_generator(path, title, fullText)
return '\n'.join(path)
@@ -139,8 +144,8 @@ def pdfhandler(path):
#Attempting to pass txt file in path
-path = 'C:/Code/texttohtml/kac.pdf'
-#path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx'
+#path = 'C:/Code/texttohtml/kac.pdf'
+path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx'
extension = path[path.rfind('.')+1: len(path)]