Mary12 committed on
Commit
1844fcc
·
1 Parent(s): 8dd5245

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -74
app.py CHANGED
@@ -4,26 +4,14 @@ from pypdf import PdfReader
4
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
5
 
6
 
7
- # def remove_references(text):
8
- # text = re.sub(r'\[\d+\]', '', text) ##[ref]
9
- # text = re.sub(r'\[https?://[^\[\]]+\s[^\[\]]+\]', '', text) ##hyperlink with text
10
- # text = re.sub(r'\[https?://[^\[\]]+\]', '', text) ##just the hyperlink
11
- # # text = html.unescape(text)
12
- # text = re.sub(r'\s+', ' ', text).strip() ##clear out the white spaces
13
- # return text
14
-
15
  def remove_references(text):
16
- patterns = [
17
- r'\[\d+\]',
18
- r'\[https?://[^\[\]]+\s[^\[\]]+\]',
19
- r'\[https?://[^\[\]]+\]'
20
- ]
21
- combined_pattern = '|'.join(patterns)
22
- text = re.sub(combined_pattern, '', text)
23
- text = re.sub(r'\s+', ' ', text).strip()
24
  return text
25
 
26
-
27
 
28
  def extract_text_from_pdf(file_path):
29
  text = ""
@@ -32,73 +20,43 @@ def extract_text_from_pdf(file_path):
32
  text += page.extract_text() + "\n"
33
  return text
34
 
35
- def get_answer(pipe, question, context):
36
- result = pipe(question=question, context=context)
37
- answered = result['answer']
38
- return remove_references(answered)
39
 
40
- def handle_missing_input(context, question):
41
- if not context and not question:
42
- return "Որպեսզի ..."
43
- if not context:
44
- return "Ես չեմ կարողանամ ..."
45
- if not question:
46
- return "Ես չեմ կարողանամ ..."
47
- return None
 
48
 
49
- def clean_and_capitalize(text):
50
- text = text.replace('(', '', 1)
51
- text = text.replace(',', '', len(text)-1)
52
- return text.capitalize()
53
 
54
  def qa_result(context, question, file):
55
  model_name = "timpal0l/mdeberta-v3-base-squad2"
56
  pipe = model(model_name)
57
- error_message = handle_missing_input(context, question)
58
- if error_message:
59
- return error_message
60
-
61
  if file is not None:
62
  context = extract_text_from_pdf(file.name)
63
- text = get_answer(pipe, question, context)
64
- return clean_and_capitalize(text)
65
-
66
-
67
- # def model(model_name):
68
- # tokenizer = AutoTokenizer.from_pretrained(model_name)
69
- # model = AutoModelForQuestionAnswering.from_pretrained(model_name,return_dict = False)
70
- # model_pipeline = pipeline(
71
- # "question-answering",
72
- # model = model,
73
- # tokenizer = tokenizer
74
- # )
75
-
76
- # return model_pipeline
77
-
78
- # def qa_result(context, question, file):
79
- # model_name = "timpal0l/mdeberta-v3-base-squad2"
80
- # pipe = model(model_name)
81
- # if file is not None:
82
- # context = extract_text_from_pdf(file.name)
83
- # result = pipe(question=question, context=context)
84
- # answered = result['answer']
85
- # text = remove_references(answered)
86
- # else:
87
 
88
- # if len(context) == 0 and len(question) == 0:
89
- # text = "Որպեսզի ես կարողանամ քեզ օգնել, ինձ պիտի տրամադրես համապատասխան տեքստն ու հարցերը։"
90
- # elif len(context) == 0:
91
- # text = "Ես չեմ կարողանամ քեզ օգնել եթե ինձ չտրամադրես տեքստը"
92
- # elif len(question) == 0:
93
- # text = "Ես չեմ կարողանամ քեզ օգնել եթե ինձ չտաս հարցդ"
94
- # else:
95
- # result = pipe(question=question, context=context)
96
- # answered = result['answer']
97
- # text = remove_references(answered)
98
- # text = text.replace('(', '', 1)
99
- # text = text.replace(',', '', len(text)-1)
100
 
101
- # return text.capitalize()
102
 
103
  theme = gr.themes.Soft().set(
104
  body_background_fill='*background_fill_secondary',
 
4
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
5
 
6
 
 
 
 
 
 
 
 
 
7
def remove_references(text):
    """Strip bracketed reference markers from ``text`` and tidy whitespace.

    The patterns are applied one after another, in this order:
    numeric markers such as ``[12]``, bracketed links followed by a label
    (``[http://... label]``), and bare bracketed links (``[http://...]``).
    Afterwards every run of whitespace is collapsed to a single space and
    the result is trimmed at both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    reference_patterns = (
        r'\[\d+\]',                          # [ref]
        r'\[https?://[^\[\]]+\s[^\[\]]+\]',  # hyperlink with text
        r'\[https?://[^\[\]]+\]',            # just the hyperlink
    )
    cleaned = text
    for pattern in reference_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    # Collapse the gaps left behind by the removed markers.
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()
14
 
 
15
 
16
  def extract_text_from_pdf(file_path):
17
  text = ""
 
20
  text += page.extract_text() + "\n"
21
  return text
22
 
 
 
 
 
23
 
24
+
25
def model(model_name):
    """Build a question-answering pipeline for a pretrained checkpoint.

    Args:
        model_name: Checkpoint identifier handed to both ``from_pretrained``
            calls.

    Returns:
        The object produced by ``pipeline("question-answering", ...)`` wired
        to the loaded model and tokenizer.
    """
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    # return_dict=False is forwarded to from_pretrained exactly as before.
    qa_model = AutoModelForQuestionAnswering.from_pretrained(
        model_name, return_dict=False
    )
    return pipeline(
        "question-answering",
        model=qa_model,
        tokenizer=qa_tokenizer,
    )
 
 
 
35
 
36
def qa_result(context, question, file):
    """Answer ``question`` from an uploaded PDF or from pasted ``context``.

    Inputs are validated before any model is loaded, so a missing question
    or missing text returns its message immediately instead of paying for a
    model load (or crashing inside the pipeline).

    Args:
        context: Text to search for the answer. Ignored when ``file`` is
            given. May be empty or ``None``.
        question: The question to answer. May be empty or ``None``.
        file: Uploaded file object whose ``.name`` is passed to
            ``extract_text_from_pdf``, or ``None`` when no file was uploaded.

    Returns:
        The answer string with reference markers removed and its first
        character upper-cased, or one of the fixed user-facing messages when
        the question and/or the text is missing.
    """
    missing_both = "Որպեսզի ես կարողանամ քեզ օգնել, ինձ պիտի տրամադրես համապատասխան տեքստն ու հարցերը։"
    missing_text = "Ես չեմ կարողանամ քեզ օգնել եթե ինձ չտրամադրես տեքստը"
    missing_question = "Ես չեմ կարողանամ քեզ օգնել եթե ինձ չտաս հարցդ"

    # Truthiness instead of len() so that None is treated like an empty value.
    has_source = file is not None or bool(context)
    if not question and not has_source:
        return missing_both
    if not has_source:
        return missing_text
    if not question:
        return missing_question

    if file is not None:
        context = extract_text_from_pdf(file.name)
        # A PDF with no extractable text leaves nothing to search.
        if not context.strip():
            return missing_text

    # Load the model only once the inputs are known to be usable.
    pipe = model("timpal0l/mdeberta-v3-base-squad2")
    answer = remove_references(pipe(question=question, context=context)['answer'])

    if file is None:
        # NOTE(review): this punctuation cleanup was applied only on the
        # pasted-text path as far as the flattened source shows; confirm
        # whether PDF answers should get it too.
        answer = answer.replace('(', '', 1)
        answer = answer.replace(',', '', len(answer) - 1)

    # Upper-case only the first character. str.capitalize() would also
    # lower-case the rest, destroying proper nouns and acronyms in the answer.
    return answer[:1].upper() + answer[1:]
60
 
61
  theme = gr.themes.Soft().set(
62
  body_background_fill='*background_fill_secondary',