dimoZ committed on
Commit
1937bf2
·
verified ·
1 Parent(s): 644e03f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -2
app.py CHANGED
@@ -10,6 +10,8 @@ from langchain.chains.question_answering import load_qa_chain
10
  from langchain.prompts import PromptTemplate
11
  from dotenv import load_dotenv
12
  from fuzzywuzzy import process
 
 
13
 
14
  # Load environment variables
15
  load_dotenv()
@@ -45,11 +47,32 @@ def extract_text_from_pdf(pdf_docs):
45
  # Function to extract text from .docx
46
  def extract_text_from_docx(docx_docs):
47
  text = ""
 
 
48
  for doc in docx_docs:
49
  document = Document(doc)
 
 
50
  for para in document.paragraphs:
51
  text += para.text + "\n"
52
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  # Function to split text into chunks
55
  def split_text_into_chunks(text):
@@ -105,11 +128,20 @@ def main():
105
  if pdf_docs or docx_docs:
106
  st.spinner("Processing...")
107
  pdf_text = extract_text_from_pdf(pdf_docs) if pdf_docs else ""
108
- docx_text = extract_text_from_docx(docx_docs) if docx_docs else ""
109
  combined_text = pdf_text + docx_text
110
  text_chunks = split_text_into_chunks(combined_text)
111
  create_vector_store(text_chunks)
112
  st.success("Documents processed successfully!")
 
 
 
 
 
 
 
 
 
113
  else:
114
  st.error("Please upload at least one document.")
115
 
 
10
  from langchain.prompts import PromptTemplate
11
  from dotenv import load_dotenv
12
  from fuzzywuzzy import process
13
+ import base64
14
+ from io import BytesIO
15
 
16
  # Load environment variables
17
  load_dotenv()
 
47
  # Function to extract text from .docx
48
  def extract_text_from_docx(docx_docs):
49
  text = ""
50
+ tables = []
51
+ images = []
52
  for doc in docx_docs:
53
  document = Document(doc)
54
+
55
+ # Extract text
56
  for para in document.paragraphs:
57
  text += para.text + "\n"
58
+
59
+ # Extract tables
60
+ for table in document.tables:
61
+ table_text = ""
62
+ for row in table.rows:
63
+ row_text = [cell.text for cell in row.cells]
64
+ table_text += " | ".join(row_text) + "\n"
65
+ tables.append(table_text)
66
+
67
+ # Extract images (figures)
68
+ for rel in document.part.rels.values():
69
+ if "image" in rel.target_ref:
70
+ img = rel.target_part
71
+ img_data = img.blob
72
+ img_b64 = base64.b64encode(img_data).decode("utf-8")
73
+ images.append(f"data:image/png;base64,{img_b64}") # Storing image as base64
74
+
75
+ return text, tables, images
76
 
77
  # Function to split text into chunks
78
  def split_text_into_chunks(text):
 
128
  if pdf_docs or docx_docs:
129
  st.spinner("Processing...")
130
  pdf_text = extract_text_from_pdf(pdf_docs) if pdf_docs else ""
131
+ docx_text, tables, images = extract_text_from_docx(docx_docs) if docx_docs else ("", [], [])
132
  combined_text = pdf_text + docx_text
133
  text_chunks = split_text_into_chunks(combined_text)
134
  create_vector_store(text_chunks)
135
  st.success("Documents processed successfully!")
136
+
137
+ # Optionally display tables and images
138
+ st.subheader("Tables Extracted:")
139
+ for table in tables:
140
+ st.write(table)
141
+
142
+ st.subheader("Figures/Images Extracted:")
143
+ for img in images:
144
+ st.image(img) # Display base64 image
145
  else:
146
  st.error("Please upload at least one document.")
147