Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,8 @@ from langchain.chains.question_answering import load_qa_chain
|
|
| 10 |
from langchain.prompts import PromptTemplate
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
from fuzzywuzzy import process
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# Load environment variables
|
| 15 |
load_dotenv()
|
|
@@ -45,11 +47,32 @@ def extract_text_from_pdf(pdf_docs):
|
|
| 45 |
# Function to extract text from .docx
|
| 46 |
def extract_text_from_docx(docx_docs):
|
| 47 |
text = ""
|
|
|
|
|
|
|
| 48 |
for doc in docx_docs:
|
| 49 |
document = Document(doc)
|
|
|
|
|
|
|
| 50 |
for para in document.paragraphs:
|
| 51 |
text += para.text + "\n"
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
# Function to split text into chunks
|
| 55 |
def split_text_into_chunks(text):
|
|
@@ -105,11 +128,20 @@ def main():
|
|
| 105 |
if pdf_docs or docx_docs:
|
| 106 |
st.spinner("Processing...")
|
| 107 |
pdf_text = extract_text_from_pdf(pdf_docs) if pdf_docs else ""
|
| 108 |
-
docx_text = extract_text_from_docx(docx_docs) if docx_docs else ""
|
| 109 |
combined_text = pdf_text + docx_text
|
| 110 |
text_chunks = split_text_into_chunks(combined_text)
|
| 111 |
create_vector_store(text_chunks)
|
| 112 |
st.success("Documents processed successfully!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
else:
|
| 114 |
st.error("Please upload at least one document.")
|
| 115 |
|
|
|
|
| 10 |
from langchain.prompts import PromptTemplate
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
from fuzzywuzzy import process
|
| 13 |
+
import base64
|
| 14 |
+
from io import BytesIO
|
| 15 |
|
| 16 |
# Load environment variables
|
| 17 |
load_dotenv()
|
|
|
|
| 47 |
# Function to extract text from .docx
|
| 48 |
def extract_text_from_docx(docx_docs):
    """Extract paragraph text, tables, and embedded images from .docx files.

    Args:
        docx_docs: iterable of file-like objects (or paths) accepted by
            python-docx's ``Document`` constructor, e.g. Streamlit uploads.

    Returns:
        tuple ``(text, tables, images)``:
            text   -- str: all paragraph text across all documents, one
                      paragraph per line (newline-terminated).
            tables -- list[str]: each table flattened to rows of
                      " | "-separated cell text, one row per line.
            images -- list[str]: base64 data-URI strings for every image
                      part found in each document's relationships.
    """
    paragraphs = []
    tables = []
    images = []
    for doc in docx_docs:
        document = Document(doc)

        # Collect paragraph text; joined once at the end instead of the
        # original quadratic `text += para.text + "\n"` accumulation.
        for para in document.paragraphs:
            paragraphs.append(para.text)

        # Flatten each table: one newline-terminated line per row.
        for table in document.tables:
            table_text = "".join(
                " | ".join(cell.text for cell in row.cells) + "\n"
                for row in table.rows
            )
            tables.append(table_text)

        # Embedded figures live in the document part's relationships.
        for rel in document.part.rels.values():
            if "image" in rel.target_ref:
                img = rel.target_part
                # BUG FIX: the MIME type was hard-coded to image/png even
                # for JPEG/GIF parts, producing broken data URIs. Use the
                # part's declared content type, falling back to PNG only
                # when it is unavailable.
                mime = getattr(img, "content_type", None) or "image/png"
                img_b64 = base64.b64encode(img.blob).decode("utf-8")
                images.append(f"data:{mime};base64,{img_b64}")

    # Preserve the original trailing-newline-per-paragraph shape.
    text = "\n".join(paragraphs) + ("\n" if paragraphs else "")
    return text, tables, images
|
| 76 |
|
| 77 |
# Function to split text into chunks
|
| 78 |
def split_text_into_chunks(text):
|
|
|
|
| 128 |
if pdf_docs or docx_docs:
|
| 129 |
st.spinner("Processing...")
|
| 130 |
pdf_text = extract_text_from_pdf(pdf_docs) if pdf_docs else ""
|
| 131 |
+
docx_text, tables, images = extract_text_from_docx(docx_docs) if docx_docs else ("", [], [])
|
| 132 |
combined_text = pdf_text + docx_text
|
| 133 |
text_chunks = split_text_into_chunks(combined_text)
|
| 134 |
create_vector_store(text_chunks)
|
| 135 |
st.success("Documents processed successfully!")
|
| 136 |
+
|
| 137 |
+
# Optionally display tables and images
|
| 138 |
+
st.subheader("Tables Extracted:")
|
| 139 |
+
for table in tables:
|
| 140 |
+
st.write(table)
|
| 141 |
+
|
| 142 |
+
st.subheader("Figures/Images Extracted:")
|
| 143 |
+
for img in images:
|
| 144 |
+
st.image(img) # Display base64 image
|
| 145 |
else:
|
| 146 |
st.error("Please upload at least one document.")
|
| 147 |
|