Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,26 +4,14 @@ from pypdf import PdfReader
|
|
| 4 |
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
|
| 5 |
|
| 6 |
|
| 7 |
-
# def remove_references(text):
|
| 8 |
-
# text = re.sub(r'\[\d+\]', '', text) ##[ref]
|
| 9 |
-
# text = re.sub(r'\[https?://[^\[\]]+\s[^\[\]]+\]', '', text) ##hyperlink with text
|
| 10 |
-
# text = re.sub(r'\[https?://[^\[\]]+\]', '', text) ##just the hyperlink
|
| 11 |
-
# # text = html.unescape(text)
|
| 12 |
-
# text = re.sub(r'\s+', ' ', text).strip() ##clear out the white spaces
|
| 13 |
-
# return text
|
| 14 |
-
|
| 15 |
def remove_references(text):
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
combined_pattern = '|'.join(patterns)
|
| 22 |
-
text = re.sub(combined_pattern, '', text)
|
| 23 |
-
text = re.sub(r'\s+', ' ', text).strip()
|
| 24 |
return text
|
| 25 |
|
| 26 |
-
|
| 27 |
|
| 28 |
def extract_text_from_pdf(file_path):
|
| 29 |
text = ""
|
|
@@ -32,73 +20,43 @@ def extract_text_from_pdf(file_path):
|
|
| 32 |
text += page.extract_text() + "\n"
|
| 33 |
return text
|
| 34 |
|
| 35 |
-
def get_answer(pipe, question, context):
    """Run ``pipe`` on the question/context pair and return the cleaned answer.

    ``pipe`` is called with ``question`` and ``context`` keyword arguments;
    the ``'answer'`` entry of its result is passed through
    ``remove_references`` before being returned.
    """
    prediction = pipe(question=question, context=context)
    return remove_references(prediction['answer'])
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
-
text = text.replace('(', '', 1)
|
| 51 |
-
text = text.replace(',', '', len(text)-1)
|
| 52 |
-
return text.capitalize()
|
| 53 |
|
| 54 |
def qa_result(context, question, file):
|
| 55 |
model_name = "timpal0l/mdeberta-v3-base-squad2"
|
| 56 |
pipe = model(model_name)
|
| 57 |
-
error_message = handle_missing_input(context, question)
|
| 58 |
-
if error_message:
|
| 59 |
-
return error_message
|
| 60 |
-
|
| 61 |
if file is not None:
|
| 62 |
context = extract_text_from_pdf(file.name)
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
# def model(model_name):
|
| 68 |
-
# tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 69 |
-
# model = AutoModelForQuestionAnswering.from_pretrained(model_name,return_dict = False)
|
| 70 |
-
# model_pipeline = pipeline(
|
| 71 |
-
# "question-answering",
|
| 72 |
-
# model = model,
|
| 73 |
-
# tokenizer = tokenizer
|
| 74 |
-
# )
|
| 75 |
-
|
| 76 |
-
# return model_pipeline
|
| 77 |
-
|
| 78 |
-
# def qa_result(context, question, file):
|
| 79 |
-
# model_name = "timpal0l/mdeberta-v3-base-squad2"
|
| 80 |
-
# pipe = model(model_name)
|
| 81 |
-
# if file is not None:
|
| 82 |
-
# context = extract_text_from_pdf(file.name)
|
| 83 |
-
# result = pipe(question=question, context=context)
|
| 84 |
-
# answered = result['answer']
|
| 85 |
-
# text = remove_references(answered)
|
| 86 |
-
# else:
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
|
| 101 |
-
|
| 102 |
|
| 103 |
theme = gr.themes.Soft().set(
|
| 104 |
body_background_fill='*background_fill_secondary',
|
|
|
|
| 4 |
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
def remove_references(text):
    """Strip bracketed reference markers and bracketed links from ``text``.

    Three substitutions run in a fixed order: numeric markers such as
    ``[12]``, bracketed URLs followed by link text, and bare bracketed URLs.
    Afterwards every run of whitespace is collapsed to one space and the
    result is trimmed at both ends.
    """
    bracket_patterns = (
        r'\[\d+\]',                          # numeric reference marker
        r'\[https?://[^\[\]]+\s[^\[\]]+\]',  # hyperlink followed by text
        r'\[https?://[^\[\]]+\]',            # just the hyperlink
    )
    cleaned = text
    # The passes stay sequential: removing an inner match can expose an
    # outer one that a later pass then removes.
    for pattern in bracket_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()
|
| 14 |
|
|
|
|
| 15 |
|
| 16 |
def extract_text_from_pdf(file_path):
|
| 17 |
text = ""
|
|
|
|
| 20 |
text += page.extract_text() + "\n"
|
| 21 |
return text
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
|
| 25 |
+
# Pipelines that have already been built, keyed by checkpoint name, so that
# repeated calls do not load the tokenizer and weights again.
_PIPELINE_CACHE = {}


def model(model_name):
    """Return a question-answering pipeline for the checkpoint ``model_name``.

    The first call for a given name loads the tokenizer and the model
    (with ``return_dict=False``) and wraps them in a ``"question-answering"``
    pipeline. Later calls with the same name return that same pipeline
    object instead of loading everything again.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        The pipeline object built for ``model_name``.
    """
    cached = _PIPELINE_CACHE.get(model_name)
    if cached is not None:
        return cached

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Named qa_model so the local does not shadow this function's own name.
    qa_model = AutoModelForQuestionAnswering.from_pretrained(
        model_name, return_dict=False
    )
    model_pipeline = pipeline(
        "question-answering",
        model=qa_model,
        tokenizer=tokenizer,
    )

    _PIPELINE_CACHE[model_name] = model_pipeline
    return model_pipeline
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
def qa_result(context, question, file):
    """Answer ``question`` from an uploaded PDF or from the given ``context``.

    When ``file`` is provided, its text (read via ``extract_text_from_pdf``)
    replaces ``context``. Both inputs are then validated before the model is
    loaded, so a missing or whitespace-only context/question yields a
    user-facing message instead of reaching the pipeline.

    Args:
        context: Text to search for the answer; overridden when ``file`` is
            given. May be empty or ``None``.
        question: The question to answer. May be empty or ``None``.
        file: Uploaded file object whose ``.name`` is the path to read, or
            ``None`` when no file was uploaded.

    Returns:
        The answer, or a message naming the missing input, after
        ``str.capitalize()``.
    """
    from_pdf = file is not None
    if from_pdf:
        context = extract_text_from_pdf(file.name)

    # Truthiness plus strip() treats None, "" and whitespace-only input
    # (e.g. a PDF whose pages yield only newlines) as missing.
    has_context = bool(context and context.strip())
    has_question = bool(question and question.strip())

    if not has_context and not has_question:
        text = "Որպեսզի ես կարողանամ քեզ օգնել, ինձ պիտի տրամադրես համապատասխան տեքստն ու հարցերը։"
    elif not has_context:
        text = "Ես չեմ կարողանամ քեզ օգնել եթե ինձ չտրամադրես տեքստը"
    elif not has_question:
        text = "Ես չեմ կարողանամ քեզ օգնել եթե ինձ չտաս հարցդ"
    else:
        # Load the pipeline only once there is something to answer.
        pipe = model("timpal0l/mdeberta-v3-base-squad2")
        result = pipe(question=question, context=context)
        text = remove_references(result['answer'])
        if not from_pdf:
            # NOTE(review): the original appears to apply this punctuation
            # clean-up only to answers from pasted text, not from PDFs;
            # kept as is -- confirm whether PDF answers should get it too.
            text = text.replace('(', '', 1)
            # Removes commas, at most len(text) - 1 of them.
            text = text.replace(',', '', len(text) - 1)

    return text.capitalize()
|
| 60 |
|
| 61 |
theme = gr.themes.Soft().set(
|
| 62 |
body_background_fill='*background_fill_secondary',
|