From 5789a123268db253f75836def35df3b54529bc88 Mon Sep 17 00:00:00 2001 From: Joshua Drake Date: Sun, 23 Oct 2022 00:22:13 -0500 Subject: Added Line Break and Ascii handlers. --- FBGM.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/FBGM.py b/FBGM.py index 7344207..3a37c7c 100644 --- a/FBGM.py +++ b/FBGM.py @@ -4,8 +4,14 @@ import os, sys, docx, PyPDF2 from string import punctuation from PyPDF2 import PdfReader +def process_text(string): + string = remove_non_ascii(string) + string = remove_line_break(string) + return string def remove_non_ascii(string): return string.encode('ascii', errors='xmlcharrefreplace').decode() +def remove_line_break(string): + return string.replace("- ","") def html_file_generator(path, page_header, page_contents): filename = path[path.rfind('/')+1: path.rfind('.')] @@ -24,7 +30,7 @@ def html_file_generator(path, page_header, page_contents): '' '' '

' +page_header+ '

') - htmlfile.write(remove_non_ascii(page_contents)) + htmlfile.write(process_text(page_contents)) htmlfile.write('' '') htmlfile.close() @@ -99,7 +105,6 @@ def dochandler(path): elif para.style.name in paragraph_style: fullText+="

"+para.text fullText+="

\n" - print(para.text, para.style.name) html_file_generator(path, title, fullText) return '\n'.join(path) @@ -139,8 +144,8 @@ def pdfhandler(path): #Attempting to pass txt file in path -path = 'C:/Code/texttohtml/kac.pdf' -#path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx' +#path = 'C:/Code/texttohtml/kac.pdf' +path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx' extension = path[path.rfind('.')+1: len(path)] -- cgit v1.2.3