ShayanRl committed on
Commit
f556cd6
·
verified ·
1 Parent(s): f92b95a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -84
app.py CHANGED
@@ -1,57 +1,13 @@
1
  import streamlit as st
2
  import io
3
  import requests
4
- import pdfplumber
5
- import os
6
- from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
7
 
8
- from huggingface_hub import login
9
 
10
- token = os.getenv("hf_token")
11
- login(token=token)
12
 
13
- batch_size = 96
14
- n_epochs = 2
15
- base_LM_model = "roberta-base"
16
- max_seq_len = 512
17
- learning_rate = 3e-5
18
- warmup_proportion = 0.2
19
- doc_stride=128
20
- max_query_length=64
21
 
22
 
23
-
24
-
25
-
26
- def AImodel(text,questionText):
27
- model_name = "deepset/roberta-base-squad2"
28
-
29
-
30
- # a) Get predictions
31
- nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
32
- QA_input = {
33
- 'question': questionText,
34
- 'context': text
35
- }
36
- res = nlp(QA_input)
37
-
38
-
39
-
40
- # b) Load model & tokenizer
41
- model = AutoModelForQuestionAnswering.from_pretrained(model_name)
42
- tokenizer = AutoTokenizer.from_pretrained(model_name)
43
- inputs = tokenizer(
44
- questionText, text,
45
- add_special_tokens=True,
46
- max_length=max_seq_len,
47
- truncation=True,
48
- return_tensors="pt",
49
- padding="max_length",
50
- stride=doc_stride,
51
- return_overflowing_tokens=True,
52
- return_offsets_mapping=True
53
- )
54
- return(res)
55
 
56
 
57
 
@@ -63,36 +19,12 @@ def fextractURL(pdf_path):
63
  # If the URL ends with .pdf, use pdfplumber directly
64
  r = requests.get(pdf_path)
65
  f = io.BytesIO(r.content)
66
- with pdfplumber.open(f) as pdf:
67
- for page in pdf.pages:
68
- extracted_data += page.extract_text() + "\n" # Extract text
69
- tables = page.extract_tables() # Extract tables
70
- for table in tables:
71
- for row in table:
72
- extracted_data += "\t".join(str(cell) for cell in row) + "\n"
73
- else:
74
- # If the URL does not end with .pdf, download the PDF first
75
- response = requests.get(pdf_path)
76
- pdf_content = response.content
77
-
78
- # Save the PDF locally
79
- pdf_filename = 'downloaded_document.pdf'
80
- with open(pdf_filename, 'wb') as pdf_file:
81
- pdf_file.write(pdf_content)
82
-
83
- # Extract content using pdfplumber
84
- with pdfplumber.open(pdf_filename) as pdf:
85
- for page in pdf.pages:
86
- extracted_data += page.extract_text() + "\n" # Extract text
87
- tables = page.extract_tables() # Extract tables
88
- for table in tables:
89
- for row in table:
90
- extracted_data += "\t".join(str(cell) for cell in row) + "\n"
91
-
92
- # Delete the PDF file
93
- os.remove(pdf_filename)
94
  except Exception as e:
95
- st.error(f"An error occurred: {str(e)}")
96
 
97
  return extracted_data
98
 
@@ -102,19 +34,13 @@ st.markdown(vert_space, unsafe_allow_html=True)
102
  st.write("Extract full text from PDF URL")
103
 
104
  pdfURL = st.text_input(label="PDF URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
105
- questionText = st.text_input(label="question", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
106
  button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
107
  extractedText = st.empty()
108
 
109
  if button:
110
  try:
111
  text = fextractURL(pdfURL)
112
- AItext = AImodel(text,questionText)
113
- extractedText.text(AItext)
114
-
115
  except Exception as e:
116
- st.error(f"An error occurred: {str(e)}")
117
-
118
-
119
-
120
-
 
1
  import streamlit as st
2
  import io
3
  import requests
 
 
 
4
 
5
+ from docquery import document, pipeline
6
 
 
 
7
 
 
 
 
 
 
 
 
 
8
 
9
 
10
+ p = pipeline('document-question-answering')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
 
13
 
 
19
  # If the URL ends with .pdf, use pdfplumber directly
20
  r = requests.get(pdf_path)
21
  f = io.BytesIO(r.content)
22
+
23
+ doc = document.load_document(f)
24
+ for q in ["What is the 2022 net income?", "What is the 2023 net income ?"]:
25
+ extracted_data+= (q, p(question=q, **doc.context))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  except Exception as e:
27
+ st.error(f"An error o0000ccurred: {str(e)}")
28
 
29
  return extracted_data
30
 
 
34
  st.write("Extract full text from PDF URL")
35
 
36
  pdfURL = st.text_input(label="PDF URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
 
37
  button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
38
  extractedText = st.empty()
39
 
40
  if button:
41
  try:
42
  text = fextractURL(pdfURL)
43
+ print(text)
44
+ extractedText.text(text)
 
45
  except Exception as e:
46
+ st.error(f"An error occurrrrred: {str(e)}")