import streamlit as st
from PIL import Image
# from pdf2image import convert_from_path
import pandas as pd
import yake
import fitz
import nltk
from gtts import gTTS

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import os
import re

os.system('pip install -q pytesseract')
import pytesseract
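
# Streamlit app: OCR text from images (pytesseract) or PDFs (PyMuPDF),
# extract keywords with YAKE, and for each keyword show / read aloud (gTTS)
# the most relevant sentence found via TF-IDF cosine similarity.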
st.title("Extract info from Files")
st.sidebar.title('Hyper Params')

menu = ["Image", "Dataset", "DocumentFiles", "About"]
choice = st.sidebar.selectbox("Select the type of data", menu)
no_of_keys = st.sidebar.slider('Select the no of keywords', 1, 20, 2, 2)
output = st.selectbox('Select the type of output', ('keys', 'response'))

# filters for pre-processing the uploaded images
filters = ['Gaussian', 'Low pass', 'High Pass', 'System defined']
filter = st.sidebar.selectbox("Select the type of filter to preprocess the image", filters)

# tes = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
# pytesseract.pytesseract.tesseract_cmd = tes
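
# YAKE keyword-extractor configuration, driven by the sidebar sliders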
extractor = yake.KeywordExtractor()
language = 'en'
max_ngram_size = st.sidebar.slider('Select the parameter for ngram', 1, 20, 3, 2)
deduplication_threshold = st.sidebar.slider('Select the parameter for DD threshold', 1, 10, 9, 1)
deduplication_threshold = deduplication_threshold / 10
numOfKeywords = 100
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

lemmer = nltk.stem.WordNetLemmatizer()
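
# tokenisation helpers: lower-case, strip punctuation and lemmatise for the TF-IDF vectoriser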
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
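
# for each selected keyword, find the sentence of the document most similar to it
# (TF-IDF + cosine similarity), display it and read it aloud with gTTS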
def rees(glo_text, keys):
    for key in keys[:no_of_keys]:
        sent_tokens = nltk.sent_tokenize(glo_text)
        word_tokens = nltk.word_tokenize(glo_text)
        sent_tokens.append(key)
        word_tokens = word_tokens + nltk.word_tokenize(key)
        TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
        tfidf = TfidfVec.fit_transform(sent_tokens)
        vals = cosine_similarity(tfidf[-1], tfidf)
        idx = vals.argsort()[0][-2]
        response = sent_tokens[idx]
        if output == 'response':
            st.write(' - ' + key + ': ' + response)
        else:
            st.write(' - ' + key)
        response = re.sub("[^a-zA-Z0-9 ]", " ", response)
        myobj = gTTS(text=response, lang=language, slow=False)
        myobj.save("audio.mp3")
        st.audio("audio.mp3", format='audio/mpeg')
        os.remove("audio.mp3")
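
# OCR an uploaded image with pytesseract and show a small preview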
def load_image(image_file):
    img = Image.open(image_file)
    st.image(img, width=250)
    text = pytesseract.image_to_string(img)
    img.close()
    return text
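
# read an uploaded PDF with PyMuPDF (fitz), concatenate the page text
# and collect YAKE keywords from it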
def load_pdf(data_file):
    doc = fitz.open(stream=data_file.read(), filetype="pdf")
    text = ""
    glo_text = ''
    for page in doc:
        text = text + page.get_text()
    glo_text += text
    keywords = custom_kw_extractor.extract_keywords(text)
    for kw in keywords[::-1]:
        if kw[1] > 0.1:
            keys.append(kw[0])
    doc.close()
    return glo_text, keys
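
# keyword list shared by the image and document pipelines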
keys = []
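
# run OCR on an uploaded image and pull keywords from the recognised text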
def tes_image(image_file):
    if image_file != None:
        # add filters if time permits
        glo_text = ''
        text = load_image(image_file)  # a specific language could be passed to Tesseract here
        glo_text += text
        keywords = custom_kw_extractor.extract_keywords(text)
        for kw in keywords[::-1]:
            if kw[1] > 0.1:
                keys.append(kw[0])
        return glo_text, keys
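
# wrapper that delegates uploaded document files to load_pdf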
def tes_doc(data_file):
    if data_file != None:
        tup = load_pdf(data_file)
        return tup

def convert_df_to_text(df):
    pass  # implement key to text here using key2text package
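
# main menu: dispatch on the selected data type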
if choice == "Image":
    st.subheader("Image")
    image_file = st.file_uploader("Upload Images", type=["png", "jpg", "jpeg"])
    if image_file != None:
        file_details = {"filename": image_file.name, "filetype": image_file.type, "filesize": image_file.size}
        st.write(file_details)
        glo_text, keys = tes_image(image_file)
        rees(glo_text, keys)
elif choice == "Dataset":
    st.subheader("Dataset")
    data_file = st.file_uploader("Upload CSV", type=["csv"])
    if data_file != None:
        file_details = {"filename": data_file.name, "filetype": data_file.type, "filesize": data_file.size}
        st.write(file_details)
        df = pd.read_csv(data_file)
        st.write(df)
        convert_df_to_text(df)
elif choice == "DocumentFiles":
    st.subheader("DocumentFiles")
    docx_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
    if st.button("Process"):
        if docx_file is not None:
            file_details = {"filename": docx_file.name, "filetype": docx_file.type, "filesize": docx_file.size}
            st.write(file_details)
            glo_text, keys = tes_doc(docx_file)
            rees(glo_text, keys)