Spaces:
Runtime error
Runtime error
error handling
Browse files
app.py
CHANGED
|
@@ -32,7 +32,7 @@ def process_pdfs(parent_dir: Union[str,list]):
|
|
| 32 |
parent_dir = [parent_dir]
|
| 33 |
for file_path in parent_dir:
|
| 34 |
if ".pdf" not in file_path : # skip non pdf files
|
| 35 |
-
|
| 36 |
# creating a pdf file object
|
| 37 |
pdfFileObj = open(file_path, 'rb')
|
| 38 |
|
|
@@ -48,8 +48,8 @@ def process_pdfs(parent_dir: Union[str,list]):
|
|
| 48 |
txt = txt.replace("\t","") # strip tabs
|
| 49 |
txt = re.sub(r" +"," ",txt) # strip extra space
|
| 50 |
# 512 is related to the positional encoding "facebook/dpr-ctx_encoder-single-nq-base" model
|
|
|
|
| 51 |
if len(txt) < 512 :
|
| 52 |
-
file_name = file_path.split("/")[-1]
|
| 53 |
new_data = {"title":f"{file_name}-page-{i}","text":txt}
|
| 54 |
df = df.append(new_data,ignore_index=True)
|
| 55 |
else :
|
|
@@ -70,6 +70,8 @@ def process(example):
|
|
| 70 |
|
| 71 |
def process_dataset(df):
|
| 72 |
"""processess the dataframe and returns a dataset variable"""
|
|
|
|
|
|
|
| 73 |
ds = Dataset.from_pandas(df)
|
| 74 |
ds = ds.map(process)
|
| 75 |
ds.add_faiss_index(column='embeddings') # add faiss index
|
|
@@ -77,19 +79,26 @@ def process_dataset(df):
|
|
| 77 |
|
| 78 |
def search(query, ds, k=3):
|
| 79 |
"""searches the query in the dataset and returns the k most similar"""
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
| 86 |
return out
|
| 87 |
|
| 88 |
def predict(query,file_paths, k=3):
|
| 89 |
"""predicts the most similar files to the query"""
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
with gr.Blocks() as demo :
|
| 95 |
with gr.Column():
|
|
|
|
| 32 |
parent_dir = [parent_dir]
|
| 33 |
for file_path in parent_dir:
|
| 34 |
if ".pdf" not in file_path : # skip non pdf files
|
| 35 |
+
raise Exception("only pdf files are supported")
|
| 36 |
# creating a pdf file object
|
| 37 |
pdfFileObj = open(file_path, 'rb')
|
| 38 |
|
|
|
|
| 48 |
txt = txt.replace("\t","") # strip tabs
|
| 49 |
txt = re.sub(r" +"," ",txt) # strip extra space
|
| 50 |
# 512 is related to the positional encoding "facebook/dpr-ctx_encoder-single-nq-base" model
|
| 51 |
+
file_name = file_path.split("/")[-1]
|
| 52 |
if len(txt) < 512 :
|
|
|
|
| 53 |
new_data = {"title":f"{file_name}-page-{i}","text":txt}
|
| 54 |
df = df.append(new_data,ignore_index=True)
|
| 55 |
else :
|
|
|
|
| 70 |
|
| 71 |
def process_dataset(df):
|
| 72 |
"""processess the dataframe and returns a dataset variable"""
|
| 73 |
+
if len(df) == 0 :
|
| 74 |
+
raise Exception("empty pdf files, or can't read text from them")
|
| 75 |
ds = Dataset.from_pandas(df)
|
| 76 |
ds = ds.map(process)
|
| 77 |
ds.add_faiss_index(column='embeddings') # add faiss index
|
|
|
|
| 79 |
|
| 80 |
def search(query, ds, k=3):
    """Search the dataset for the k passages most similar to the query.

    The query is tokenized with ``q_tokenizer`` (PyTorch tensors), embedded
    with ``q_encoder`` and looked up in the ``"embeddings"`` index of ``ds``.

    Args:
        query: Question text to look up.
        ds: Dataset exposing ``get_nearest_examples`` over an index named
            ``"embeddings"``, with ``"title"`` and ``"text"`` columns.
        k: Number of nearest examples to retrieve.

    Returns:
        A string with the title and content of the best match followed by
        the titles of every retrieved example. On failure (blank query,
        nothing retrieved, or any exception raised along the way) a string
        of the form ``"error: ..."`` is returned instead of raising, so the
        caller can display it directly.
    """
    # A blank query would still be embedded and produce a meaningless match.
    if query is None or (isinstance(query, str) and not query.strip()):
        return "error: empty query"
    try:
        tokens = q_tokenizer(query, return_tensors="pt")
        # detach() keeps numpy() working even when autograd is enabled.
        query_embed = q_encoder(**tokens)[0][0].detach().numpy()
        scores, retrieved_examples = ds.get_nearest_examples(
            "embeddings", query_embed, k=k
        )
        titles = retrieved_examples["title"]
        texts = retrieved_examples["text"]
        if not titles or not texts:
            # Avoid the opaque "list index out of range" message.
            return "error: no results found"
        out = (
            f"title : {titles[0]},\ncontent: {texts[0]}\n"
            f"similar resources: {titles}\n"
        )
    except Exception as e:
        # Top-level boundary for the UI: report the failure as text.
        out = f"error: {e}"
    return out
|
| 92 |
|
| 93 |
def predict(query, file_paths, k=3):
    """Answer the query from the content of the given PDF files.

    Runs the whole pipeline: ``process_pdfs`` builds a dataframe from the
    files, ``process_dataset`` turns it into an indexed dataset, and
    ``search`` looks the query up in that dataset.

    Args:
        query: Question text forwarded to ``search``.
        file_paths: A path or a list of paths forwarded to ``process_pdfs``.
        k: Number of similar passages to retrieve.

    Returns:
        The string produced by ``search``. When no files are given, or any
        pipeline step raises, a string of the form ``"error: ..."`` is
        returned instead of raising.
    """
    # Without this guard an empty selection is reported as unreadable PDFs
    # (or as a TypeError message when file_paths is None).
    if not file_paths:
        return "error: no pdf files were provided"
    try:
        df = process_pdfs(file_paths)
        ds = process_dataset(df)
        out = search(query, ds, k=k)
    except Exception as e:
        # Top-level boundary for the UI: report the failure as text.
        out = f"error: {e}"
    return out
|
| 102 |
|
| 103 |
with gr.Blocks() as demo :
|
| 104 |
with gr.Column():
|