Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,8 @@ from langchain.chains.question_answering import load_qa_chain
|
|
| 10 |
from langchain.prompts import PromptTemplate
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
from fuzzywuzzy import process
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# Load environment variables
|
| 15 |
load_dotenv()
|
|
@@ -45,11 +47,32 @@ def extract_text_from_pdf(pdf_docs):
|
|
| 45 |
# Function to extract text from .docx
|
| 46 |
def extract_text_from_docx(docx_docs):
|
| 47 |
text = ""
|
|
|
|
|
|
|
| 48 |
for doc in docx_docs:
|
| 49 |
document = Document(doc)
|
|
|
|
|
|
|
| 50 |
for para in document.paragraphs:
|
| 51 |
text += para.text + "\n"
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
# Function to split text into chunks
|
| 55 |
def split_text_into_chunks(text):
|
|
@@ -105,11 +128,20 @@ def main():
|
|
| 105 |
if pdf_docs or docx_docs:
|
| 106 |
st.spinner("Processing...")
|
| 107 |
pdf_text = extract_text_from_pdf(pdf_docs) if pdf_docs else ""
|
| 108 |
-
docx_text = extract_text_from_docx(docx_docs) if docx_docs else ""
|
| 109 |
combined_text = pdf_text + docx_text
|
| 110 |
text_chunks = split_text_into_chunks(combined_text)
|
| 111 |
create_vector_store(text_chunks)
|
| 112 |
st.success("Documents processed successfully!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
else:
|
| 114 |
st.error("Please upload at least one document.")
|
| 115 |
|
|
|
|
| 10 |
from langchain.prompts import PromptTemplate
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
from fuzzywuzzy import process
|
| 13 |
+
import base64
|
| 14 |
+
from io import BytesIO
|
| 15 |
|
| 16 |
# Load environment variables
|
| 17 |
load_dotenv()
|
|
|
|
| 47 |
# Function to extract text from .docx
|
| 48 |
def extract_text_from_docx(docx_docs):
    """Extract paragraph text, tables, and embedded images from .docx files.

    Args:
        docx_docs: iterable of file-like objects (or paths) accepted by
            python-docx's ``Document`` constructor, e.g. Streamlit uploads.

    Returns:
        tuple ``(text, tables, images)``:
            text   -- str: all paragraph text across all documents, one
                      paragraph per line (newline-terminated).
            tables -- list[str]: each table flattened to rows of
                      " | "-separated cell text, one row per line.
            images -- list[str]: base64 data-URI strings for every image
                      part found in each document's relationships.
    """
    paragraphs = []
    tables = []
    images = []
    for doc in docx_docs:
        document = Document(doc)

        # Collect paragraph text; joined once at the end instead of the
        # original quadratic `text += para.text + "\n"` accumulation.
        for para in document.paragraphs:
            paragraphs.append(para.text)

        # Flatten each table: one newline-terminated line per row.
        for table in document.tables:
            table_text = "".join(
                " | ".join(cell.text for cell in row.cells) + "\n"
                for row in table.rows
            )
            tables.append(table_text)

        # Embedded figures live in the document part's relationships.
        for rel in document.part.rels.values():
            if "image" in rel.target_ref:
                img = rel.target_part
                # BUG FIX: the MIME type was hard-coded to image/png even
                # for JPEG/GIF parts, producing broken data URIs. Use the
                # part's declared content type, falling back to PNG only
                # when it is unavailable.
                mime = getattr(img, "content_type", None) or "image/png"
                img_b64 = base64.b64encode(img.blob).decode("utf-8")
                images.append(f"data:{mime};base64,{img_b64}")

    # Preserve the original trailing-newline-per-paragraph shape.
    text = "\n".join(paragraphs) + ("\n" if paragraphs else "")
    return text, tables, images
|
| 76 |
|
| 77 |
# Function to split text into chunks
|
| 78 |
def split_text_into_chunks(text):
|
|
|
|
| 128 |
if pdf_docs or docx_docs:
|
| 129 |
st.spinner("Processing...")
|
| 130 |
pdf_text = extract_text_from_pdf(pdf_docs) if pdf_docs else ""
|
| 131 |
+
docx_text, tables, images = extract_text_from_docx(docx_docs) if docx_docs else ("", [], [])
|
| 132 |
combined_text = pdf_text + docx_text
|
| 133 |
text_chunks = split_text_into_chunks(combined_text)
|
| 134 |
create_vector_store(text_chunks)
|
| 135 |
st.success("Documents processed successfully!")
|
| 136 |
+
|
| 137 |
+
# Optionally display tables and images
|
| 138 |
+
st.subheader("Tables Extracted:")
|
| 139 |
+
for table in tables:
|
| 140 |
+
st.write(table)
|
| 141 |
+
|
| 142 |
+
st.subheader("Figures/Images Extracted:")
|
| 143 |
+
for img in images:
|
| 144 |
+
st.image(img) # Display base64 image
|
| 145 |
else:
|
| 146 |
st.error("Please upload at least one document.")
|
| 147 |
|