Spaces:

AzizWazir
/

Free-Summarizer-Excel-Word-PDF

Sleeping

App Files Community

AzizWazir committed on Dec 28, 2024

Commit

2558e6a

verified ·

1 Parent(s): 3645bc3

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -44

app.py CHANGED Viewed

@@ -5,6 +5,9 @@ import numpy as np
 from transformers import pipeline
 from sentence_transformers import SentenceTransformer
 import requests
 # Initialize the summarization and question-answering models from Hugging Face
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
@@ -26,6 +29,22 @@ def extract_text_from_pdf(pdf_file):
         text += page.get_text()
     return text
 # FAISS Indexing Function with better embedding-based chunking
 def create_faiss_index(text):
     # Split the text into paragraphs or logical sections for better context
@@ -64,48 +83,57 @@ def call_groq_api(input_text):
         return f"Error: {str(e)}"
 # Streamlit UI
-st.title("RAG-based PDF Summarizer and Q&A")
-# Upload PDF file
-pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
-if pdf_file:
-    # Extract text from the uploaded PDF
-    text = extract_text_from_pdf(pdf_file)
-    st.write("Text extracted from PDF:")
-    st.write(text[:500])  # Show first 500 characters of extracted text
-    # Create FAISS index from the extracted text
-    index, paragraphs = create_faiss_index(text)
-    # Input for user query
-    query = st.text_input("Enter your query:")
-    if query:
-        st.write("Retrieving relevant information...")
-        relevant_chunk = retrieve_relevant_chunk(query, index, paragraphs)
-        st.write(f"Relevant Text: {relevant_chunk}")
-        # Answer the question based on the relevant chunk
-        st.write("Answering the question...")
-        answer = qa_pipeline(question=query, context=relevant_chunk)  # Use question-answering pipeline
-        st.write(f"Answer: {answer['answer']}")
-        # Summarize the relevant chunk, but check if it's empty or too short
-        if relevant_chunk.strip():
-            # Ensure it's long enough for summarization (avoid too short text)
-            if len(relevant_chunk.split()) > 20:  # Only summarize if the text is sufficiently long
-                try:
-                    st.write("Summarizing...")
-                    summary = summarizer(relevant_chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
-                    st.write(f"Summary: {summary}")
-                except Exception as e:
-                    st.write(f"Error summarizing text: {str(e)}")
-            else:
-                st.write("Text is too short to summarize effectively.")
-        else:
-            st.write("No relevant text found to summarize.")
-        # Optionally, summarize using Groq API
-        # summary = call_groq_api(relevant_chunk)
-        # st.write(f"Groq Summary: {summary}")

 from transformers import pipeline
 from sentence_transformers import SentenceTransformer
 import requests
+from io import BytesIO
+import docx
+import pandas as pd
 # Initialize the summarization and question-answering models from Hugging Face
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
         text += page.get_text()
     return text
+# MS Word Processing Function
+def extract_text_from_word(word_file):
+    doc = docx.Document(BytesIO(word_file.read()))
+    text = ""
+    for para in doc.paragraphs:
+        text += para.text + "\n"
+    return text
+# Excel File Processing Function
+def extract_text_from_excel(excel_file):
+    df = pd.read_excel(excel_file, engine="openpyxl")  # Use openpyxl to read .xlsx files
+    text = ""
+    for col in df.columns:
+        text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"  # Join values in each column
+    return text
 # FAISS Indexing Function with better embedding-based chunking
 def create_faiss_index(text):
     # Split the text into paragraphs or logical sections for better context
         return f"Error: {str(e)}"
 # Streamlit UI
+st.title("RAG-based PDF, Word, Excel Summarizer and Q&A")
+# Upload File
+uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
+if uploaded_file:
+    file_type = uploaded_file.type  # Get the MIME type of the uploaded file
+    # Extract text based on file type
+    if file_type == "application/pdf":
+        text = extract_text_from_pdf(uploaded_file)
+    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        text = extract_text_from_word(uploaded_file)
+    elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+        text = extract_text_from_excel(uploaded_file)
+    else:
+        st.error("Unsupported file type!")
+        text = ""
+    if text:
+        # Show extracted text (first 500 characters)
+        st.write("Text extracted from file:")
+        st.write(text[:500])  # Show first 500 characters of extracted text
+        # Create FAISS index from the extracted text
+        index, paragraphs = create_faiss_index(text)
+        # Input for user query
+        query = st.text_input("Enter your query:")
+        if query:
+            st.write("Retrieving relevant information...")
+            relevant_chunk = retrieve_relevant_chunk(query, index, paragraphs)
+            st.write(f"Relevant Text: {relevant_chunk}")
+            # Answer the question based on the relevant chunk
+            st.write("Answering the question...")
+            answer = qa_pipeline(question=query, context=relevant_chunk)  # Use question-answering pipeline
+            st.write(f"Answer: {answer['answer']}")
+            # Summarize the relevant chunk
+            if relevant_chunk.strip():
+                # Ensure it's long enough for summarization (avoid too short text)
+                if len(relevant_chunk.split()) > 20:  # Only summarize if the text is sufficiently long
+                    try:
+                        st.write("Summarizing...")
+                        summary = summarizer(relevant_chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
+                        st.write(f"Summary: {summary}")
+                    except Exception as e:
+                        st.write(f"Error summarizing text: {str(e)}")
+                else:
+                    st.write("Text is too short to summarize effectively.")
+            else:
+                st.write("No relevant text found to summarize.")