Spaces:
Runtime error
Runtime error
ugmSorcero
committed on
Commit
·
4107940
1
Parent(s):
d36f6ee
Adds file support (txt, pdf, csv)
Browse files
- interface/components.py +25 -1
- interface/pages.py +2 -0
- interface/utils.py +60 -0
- requirements.txt +2 -1
interface/components.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
from interface.utils import get_pipelines, extract_text_from_url
|
| 3 |
from interface.draw_pipelines import get_pipeline_graph
|
| 4 |
|
| 5 |
|
|
@@ -84,3 +84,27 @@ def component_article_url(container):
|
|
| 84 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
| 85 |
]
|
| 86 |
return corpus
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
|
| 3 |
from interface.draw_pipelines import get_pipeline_graph
|
| 4 |
|
| 5 |
|
|
|
|
| 84 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
| 85 |
]
|
| 86 |
return corpus
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def component_file_input(container):
    """Draw the extract text from file widget.

    Renders file uploaders inside an "Enter Files" expander within
    ``container``. After every upload whose text could be extracted, a
    separator and one more uploader are drawn, so any number of files can be
    added. Drawing stops at the first uploader that is still empty or whose
    file yields no text.

    Args:
        container: Object used as a context manager (``with container:``)
            inside which the widgets are drawn.

    Returns:
        A list of ``{"text": ..., "id": ...}`` dicts, one per extracted file,
        with ids numbered from 0 in upload order.
    """
    extracted_texts = []
    with container:
        with st.expander("Enter Files"):
            # Every uploader needs its own widget key; keys start at 1 and
            # grow by one for each additional uploader that is drawn.
            widget_key = 1
            while True:
                file = st.file_uploader(
                    "Upload a .txt, .pdf, .csv file", key=widget_key
                )
                if file is None:
                    # Nothing uploaded in the last slot yet: stop drawing.
                    break
                extracted_text = extract_text_from_file(file)
                if extracted_text is None:
                    # extract_text_from_file returned no text for this file.
                    break
                extracted_texts.append(extracted_text)
                widget_key += 1
                st.markdown("---")
    return [
        {"text": text, "id": doc_id}
        for doc_id, text in enumerate(extracted_texts)
    ]
|
interface/pages.py
CHANGED
|
@@ -2,6 +2,7 @@ import streamlit as st
|
|
| 2 |
from streamlit_option_menu import option_menu
|
| 3 |
from core.search_index import index, search
|
| 4 |
from interface.components import (
|
|
|
|
| 5 |
component_show_pipeline,
|
| 6 |
component_show_search_result,
|
| 7 |
component_text_input,
|
|
@@ -59,6 +60,7 @@ def page_index(container):
|
|
| 59 |
input_funcs = {
|
| 60 |
"Raw Text": (component_text_input, "card-text"),
|
| 61 |
"URL": (component_article_url, "card-link"),
|
|
|
|
| 62 |
}
|
| 63 |
selected_input = option_menu(
|
| 64 |
"Input Text",
|
|
|
|
| 2 |
from streamlit_option_menu import option_menu
|
| 3 |
from core.search_index import index, search
|
| 4 |
from interface.components import (
|
| 5 |
+
component_file_input,
|
| 6 |
component_show_pipeline,
|
| 7 |
component_show_search_result,
|
| 8 |
component_text_input,
|
|
|
|
| 60 |
input_funcs = {
|
| 61 |
"Raw Text": (component_text_input, "card-text"),
|
| 62 |
"URL": (component_article_url, "card-link"),
|
| 63 |
+
"File": (component_file_input, "card-file"),
|
| 64 |
}
|
| 65 |
selected_input = option_menu(
|
| 66 |
"Input Text",
|
interface/utils.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
| 1 |
import core.pipelines as pipelines_functions
|
| 2 |
from inspect import getmembers, isfunction
|
| 3 |
from newspaper import Article
|
|
|
|
| 4 |
import streamlit as st
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def get_pipelines():
|
|
@@ -21,3 +24,60 @@ def extract_text_from_url(url: str):
|
|
| 21 |
article.parse()
|
| 22 |
|
| 23 |
return article.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from io import StringIO
|
| 2 |
import core.pipelines as pipelines_functions
|
| 3 |
from inspect import getmembers, isfunction
|
| 4 |
from newspaper import Article
|
| 5 |
+
from PyPDF2 import PdfFileReader
|
| 6 |
import streamlit as st
|
| 7 |
+
import pandas as pd
|
| 8 |
|
| 9 |
|
| 10 |
def get_pipelines():
|
|
|
|
| 24 |
article.parse()
|
| 25 |
|
| 26 |
return article.text
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def extract_text_from_file(file):
    """Return the text content of an uploaded .txt, .pdf or .csv file.

    Args:
        file: Uploaded file object. Its ``type`` attribute (a MIME type
            string) selects the extraction path; depending on that type the
            object is read through ``getvalue()``, ``PdfFileReader`` or
            ``pandas.read_csv``. Presumably a Streamlit ``UploadedFile`` --
            confirm against the caller.

    Returns:
        The extracted text as a string, or ``None`` (after showing a
        Streamlit warning) when the MIME type is not supported.

    Raises:
        UnicodeDecodeError: If a ``text/plain`` file is not valid UTF-8.
    """
    mime_type = file.type

    # Plain text: the uploaded bytes are decoded as UTF-8 and returned as is.
    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    # PDF: concatenate the text of every page that can be extracted.
    if mime_type == "application/pdf":
        reader = PdfFileReader(file)
        page_texts = []
        for page_number in range(reader.numPages):
            try:
                page_texts.append(reader.getPage(page_number).extractText())
            except Exception:
                # Best effort: a page that fails to parse is skipped so the
                # remaining pages still contribute their text.
                continue
        return "".join(page_texts)

    # CSV: join all string cells, row by row, each preceded by one space.
    if mime_type == "text/csv":
        frame = pd.read_csv(file)
        rows = frame.to_numpy(dtype=object).tolist()
        # Only string cells contribute text. Checking the cell type directly
        # avoids depending on which dtype pandas assigns to text columns;
        # missing values are not str and are therefore skipped as well.
        string_cells = [
            cell
            for row in rows
            for cell in row
            if isinstance(cell, str) and cell != "nan"
        ]
        return "".join(f" {cell}" for cell in string_cells)

    st.warning(f"File type {file.type} not supported")
    return None
|
| 83 |
+
|
requirements.txt
CHANGED
|
@@ -3,4 +3,5 @@ streamlit_option_menu==0.3.2
|
|
| 3 |
farm-haystack==1.8.0
|
| 4 |
black==22.8.0
|
| 5 |
plotly==5.10.0
|
| 6 |
-
newspaper3k==0.2.8
|
|
|
|
|
|
| 3 |
farm-haystack==1.8.0
|
| 4 |
black==22.8.0
|
| 5 |
plotly==5.10.0
|
| 6 |
+
newspaper3k==0.2.8
|
| 7 |
+
PyPDF2==2.10.7
|