ShayanRl committed on
Commit
ea4451e
·
verified ·
1 Parent(s): 6cfecea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -11
app.py CHANGED
@@ -1,14 +1,32 @@
1
  import streamlit as st
2
  import io
3
  import requests
 
 
 
4
 
5
- from docquery import document, pipeline
6
 
 
 
 
7
 
8
 
9
 
10
- p = pipeline('document-question-answering')
11
-
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
 
14
  def fextractURL(pdf_path):
@@ -19,12 +37,36 @@ def fextractURL(pdf_path):
19
  # If the URL ends with .pdf, use pdfplumber directly
20
  r = requests.get(pdf_path)
21
  f = io.BytesIO(r.content)
22
-
23
- doc = document.load_document(f)
24
- for q in ["What is the 2022 net income?", "What is the 2023 net income ?"]:
25
- extracted_data+= (q, p(question=q, **doc.context))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  except Exception as e:
27
- st.error(f"An error o0000ccurred: {str(e)}")
28
 
29
  return extracted_data
30
 
@@ -34,13 +76,28 @@ st.markdown(vert_space, unsafe_allow_html=True)
34
  st.write("Extract full text from PDF URL")
35
 
36
  pdfURL = st.text_input(label="PDF URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
 
 
 
 
 
 
 
 
 
 
37
  button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
38
  extractedText = st.empty()
39
 
40
  if button:
41
  try:
42
  text = fextractURL(pdfURL)
43
- print(text)
44
- extractedText.text(text)
 
45
  except Exception as e:
46
- st.error(f"An error occurrrrred: {str(e)}")
 
 
 
 
 
1
  import streamlit as st
2
  import io
3
  import requests
4
+ import pdfplumber
5
+ import os
6
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
7
 
8
+ from groq import Groq
9
 
10
+ client = Groq(
11
+ api_key=os.getenv("groq_token"),
12
+ )
13
 
14
 
15
 
16
def AImodel(text,question):
    """Ask the Groq chat model to extract `question` from `text`.

    Args:
        text: Source text to search; the caller passes text pulled from a PDF.
        question: Description of what to extract; interpolated into the
            user prompt.

    Returns:
        The message content of the first completion choice, or an empty
        string when the response carries no content.

    Raises:
        Whatever the Groq client raises on request failure; the caller is
        expected to handle it.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            # The system instruction is sent first so it frames the user
            # turn that follows, instead of trailing after it.
            {
                "role": "system",
                "content": "You are a helpful questioning/awnsering AI. Provide the bullet points exact answer from the provided text and do not generate new text."
            },
            {
                "role": "user",
                "content": f"extract {question} in this text:{text}",
            },
        ],
        model="llama3-groq-8b-8192-tool-use-preview",
    )
    # NOTE(review): content is presumably Optional in the SDK response type
    # (e.g. tool-call replies); normalise so the UI never renders "None".
    return chat_completion.choices[0].message.content or ""
30
 
31
 
32
  def fextractURL(pdf_path):
 
37
  # If the URL ends with .pdf, use pdfplumber directly
38
  r = requests.get(pdf_path)
39
  f = io.BytesIO(r.content)
40
+ with pdfplumber.open(f) as pdf:
41
+ for page in pdf.pages:
42
+ extracted_data += page.extract_text() + "\n" # Extract text
43
+ tables = page.extract_tables() # Extract tables
44
+ for table in tables:
45
+ for row in table:
46
+ extracted_data += "\t".join(str(cell) for cell in row) + "\n"
47
+ else:
48
+ # If the URL does not end with .pdf, download the PDF first
49
+ response = requests.get(pdf_path)
50
+ pdf_content = response.content
51
+
52
+ # Save the PDF locally
53
+ pdf_filename = 'downloaded_document.pdf'
54
+ with open(pdf_filename, 'wb') as pdf_file:
55
+ pdf_file.write(pdf_content)
56
+
57
+ # Extract content using pdfplumber
58
+ with pdfplumber.open(pdf_filename) as pdf:
59
+ for page in pdf.pages:
60
+ extracted_data += page.extract_text() + "\n" # Extract text
61
+ tables = page.extract_tables() # Extract tables
62
+ for table in tables:
63
+ for row in table:
64
+ extracted_data += "\t".join(str(cell) for cell in row) + "\n"
65
+
66
+ # Delete the PDF file
67
+ os.remove(pdf_filename)
68
  except Exception as e:
69
+ st.error(f"An error occurred: {str(e)}")
70
 
71
  return extracted_data
72
 
 
76
  st.write("Extract full text from PDF URL")
77
 
78
  pdfURL = st.text_input(label="PDF URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
79
+ questionText = st.text_input(label="prompt", value="", max_chars=None, key=None, type="default", help="""You can make your prompt very specific to get the desired output. For example, if you need an invoice number, you can format your prompt like this:
80
+
81
+ Example Prompt:
82
+ *Extract items with the following details:
83
+
84
+ Invoice Number: xxxx
85
+ Date: [Insert Date format]
86
+ Customer Name:
87
+ Total Amount:
88
+ By providing clear and detailed information in your prompt, you'll receive an accurate and tailored response.""", autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
89
  button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
90
  extractedText = st.empty()
91
 
92
  if button:
93
  try:
94
  text = fextractURL(pdfURL)
95
+ AItext = AImodel(text,questionText)
96
+ extractedText.text(AItext)
97
+
98
  except Exception as e:
99
+ st.error(f"An error occurred: {str(e)}")
100
+
101
+
102
+
103
+