# Text-to-HTML converter script.
#
# The module-level `path` is split at its last '.' to obtain an extension, and
# the script dispatches on it to txthandler (txt), pdfhandler (pdf) or
# dochandler (doc/docx). Each handler extracts text and hands it to
# html_file_generator, which derives the output name from the text between the
# last '/' and the last '.' of `path`, removes any existing "<name>.html" and
# re-creates it with open(..., "x"). Because only the base name is used, the
# output file is written relative to the current working directory.
# remove_non_ascii encodes its argument to ASCII with
# errors='xmlcharrefreplace' and decodes the result back to str.
#
# NOTE(review): this file looks damaged by an extraction step - statements are
# collapsed onto a few physical lines, and the string literals that presumably
# held HTML markup are empty or split mid-literal. The rest of
# html_file_generator and the beginning of the parsing function (called as
# `paragraph_parser` further down) are not visible here, and in-line '#'
# comments now swallow the code that follows them on the same physical line.
# Restore the file from version control before changing any logic.
#
# NOTE(review): defects visible in the intact statements, to fix once the file
# has been restored:
#   - `elif extension == 'doc' or 'docx':` is always true, because the
#     non-empty string 'docx' is truthy; the final `else` branch can never
#     run. Presumably `extension in ('doc', 'docx')` was intended.
#   - pdfhandler loops over `range(number_of_pages -1)`, so the text of the
#     last page is never extracted.
#   - txthandler never closes the file it opens, and its final
#     `print(f.read())` runs after the whole file has already been read, so
#     it only ever prints an empty string.
#   - dochandler returns `'\n'.join(path)`, i.e. the individual characters of
#     the path string joined by newlines - presumably unintended.
#   - pdfhandler passes a hard-coded placeholder string as the page header
#     instead of a title derived from the document or the file name.
#   - The dispatch runs at import time; there is no
#     `if __name__ == "__main__":` guard.
from email import header from ftplib import parse150 import os, sys, docx, PyPDF2 from string import punctuation from PyPDF2 import PdfReader def remove_non_ascii(string): return string.encode('ascii', errors='xmlcharrefreplace').decode() def html_file_generator(path, page_header, page_contents): filename = path[path.rfind('/')+1: path.rfind('.')] if os.path.exists(filename+".html") == True: os.remove(filename+".html") htmlfile = open(filename+".html", "x") htmlfile.write('' '' '
' '"+paragraph[paragraph_number] paragraph_number +=1 paragraph[paragraph_number]=paragraph[paragraph_number].replace("\n"," ") paragraph[paragraph_number]=paragraph[paragraph_number].strip() paragraph[paragraph_number]="
"+paragraph[paragraph_number][len(article[article_index.index(x)-1]):] print("Header "+str(x)+" is in paragraph "+str(paragraph_number)) paragraph[-1] = paragraph[-1][:-5]+paragraph[-1][-4:] return ''.join(paragraph) #Document file Handler def dochandler(path): doc = docx.Document(path) header_styles= ["Subtitle","Heading 1", "Heading 2"] paragraph_style= ["Normal","No Spacing"] title = (path[path.rfind('/')+1: path.rfind('.')]).capitalize() if doc.paragraphs[0].style.name == "Title": title = doc.paragraphs[0].text fullText = "" for para in doc.paragraphs: if para.style.name in header_styles: fullText+="
"+para.text fullText+="
\n" print(para.text, para.style.name) html_file_generator(path, title, fullText) return '\n'.join(path) def txthandler(path): f = open(path, "r") contents = f.read() main_header = contents[0:contents.find('\n')] paragraphs = paragraph_parser(contents[contents.find('\n'):]) html_file_generator(path,main_header,paragraphs) print(f.read()) return 0 #def header_footer_remover(page_text): #same_header, same_footer = [] #for same_header #return(page_text) #def html_file_generator(path, page_header, page_contents): def pdfhandler(path): reader = PdfReader(path) number_of_pages = len(reader.pages) page_text = [] #print(number_of_pages) for x in range(number_of_pages -1): page = reader.pages[x] page_text.append(page.extract_text()) #print(page_text[4]) html_file_generator(path, "penis", paragraph_parser("".join(page_text))) return 0 #Attempting to pass txt file in path path = 'C:/Code/texttohtml/kac.pdf' #path = 'C:/Users/Josh/Desktop/porkandbeans/texttohtml/memes.docx' extension = path[path.rfind('.')+1: len(path)] if extension =='txt': txthandler(path) elif extension == 'pdf': pdfhandler(path) elif extension == 'doc' or 'docx': dochandler(path) else: print("Extension not recognized") sys.exit()