Spaces:

AzizWazir
/

Free-Summarizer-Excel-Word-PDF

Running

App Files Community

AzizWazir committed on Dec 28, 2024

Commit

1a419f4

verified ·

1 Parent(s): 1b61846

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -20

app.py CHANGED Viewed

@@ -10,25 +10,25 @@ import docx
 import pandas as pd
 # Initialize the summarization and question-answering models from Hugging Face
-summarizer = pipeline("summarization", model="facebook/bart-large-cnn", min_length=50, max_length=80)  # Concise summary settings
-qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")  # Better QA model for concise answers
 # Sentence Transformer for embedding-based retrieval
-embedder = SentenceTransformer('all-MiniLM-L6-v2')  # A compact and efficient embedding model
 # FAISS Indexing Function with better embedding-based chunking
 def create_faiss_index(text):
-    paragraphs = text.split('\n\n')  # Assuming paragraphs are separated by double newlines
     embeddings = embedder.encode(paragraphs)  # Use sentence-transformers to create embeddings
-    index = faiss.IndexFlatL2(384)  # We use 384-dimensional embeddings (from the chosen model)
-    index.add(np.array(embeddings).astype(np.float32))  # Add the embeddings to the FAISS index
     return index, paragraphs
 # Function to retrieve the most relevant chunk from the index
 def retrieve_relevant_chunk(query, index, paragraphs):
-    query_embedding = embedder.encode([query])  # Use sentence transformer to embed the query
-    D, I = index.search(np.array(query_embedding).astype(np.float32), 1)  # Search for the closest chunk
-    return paragraphs[I[0][0]]  # Return the most relevant paragraph
 # Streamlit UI
 st.title("Concise Summarizer and Q&A")
@@ -37,7 +37,7 @@ st.title("Concise Summarizer and Q&A")
 uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
 if uploaded_file:
-    file_type = uploaded_file.type  # Get the MIME type of the uploaded file
     # Extract text based on file type
     if file_type == "application/pdf":
@@ -51,7 +51,7 @@ if uploaded_file:
         for para in doc.paragraphs:
             text += para.text + "\n"
     elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
-        df = pd.read_excel(uploaded_file, engine="openpyxl")  # Use openpyxl to read .xlsx files
         text = ""
         for col in df.columns:
             text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
@@ -60,11 +60,11 @@ if uploaded_file:
         text = ""
     if text:
-        # Show extracted text (first 500 characters)
         st.write("Text extracted from file:")
-        st.write(text[:500])  # Show first 500 characters of extracted text
-        # Create FAISS index from the extracted text
         index, paragraphs = create_faiss_index(text)
         # Input for user query
@@ -77,19 +77,20 @@ if uploaded_file:
             # Answer the question based on the relevant chunk
             st.write("Answering the question...")
-            answer = qa_pipeline(question=query, context=relevant_chunk)  # Use question-answering pipeline
-            st.write(f"Answer: {answer['answer']}")
-            # Summarize the relevant chunk (display summary after answering the query)
             if relevant_chunk.strip():
-                if len(relevant_chunk.split()) > 20:  # Only summarize if the text is sufficiently long
                     try:
                         st.write("Summarizing...")
-                        summary = summarizer(relevant_chunk, max_length=80, min_length=50, do_sample=False)[0]['summary_text']
                         st.write(f"Summary: {summary}")
                     except Exception as e:
                         st.write(f"Error summarizing text: {str(e)}")
                 else:
-                    st.write("Text is too short to summarize effectively.")
             else:
                 st.write("No relevant text found to summarize.")

 import pandas as pd
 # Initialize the summarization and question-answering models from Hugging Face
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn", max_length=50, min_length=30, do_sample=False)  # Concise summary settings
+qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")  # More focused QA model
 # Sentence Transformer for embedding-based retrieval
+embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient and compact embeddings
 # FAISS Indexing Function with better embedding-based chunking
 def create_faiss_index(text):
+    paragraphs = text.split('\n\n')  # Split into paragraphs by double newlines
     embeddings = embedder.encode(paragraphs)  # Use sentence-transformers to create embeddings
+    index = faiss.IndexFlatL2(384)  # Using 384-dimensional embeddings
+    index.add(np.array(embeddings).astype(np.float32))  # Add embeddings to the FAISS index
     return index, paragraphs
 # Function to retrieve the most relevant chunk from the index
 def retrieve_relevant_chunk(query, index, paragraphs):
+    query_embedding = embedder.encode([query])  # Get embedding for the query
+    D, I = index.search(np.array(query_embedding).astype(np.float32), 1)  # Search for closest match
+    return paragraphs[I[0][0]]  # Return the relevant paragraph
 # Streamlit UI
 st.title("Concise Summarizer and Q&A")
 uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
 if uploaded_file:
+    file_type = uploaded_file.type  # Get MIME type
     # Extract text based on file type
     if file_type == "application/pdf":
         for para in doc.paragraphs:
             text += para.text + "\n"
     elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+        df = pd.read_excel(uploaded_file, engine="openpyxl")
         text = ""
         for col in df.columns:
             text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
         text = ""
     if text:
+        # Display the first 500 characters of extracted text
         st.write("Text extracted from file:")
+        st.write(text[:500])  # Show first 500 characters
+        # Create FAISS index
         index, paragraphs = create_faiss_index(text)
         # Input for user query
             # Answer the question based on the relevant chunk
             st.write("Answering the question...")
+            answer = qa_pipeline(question=query, context=relevant_chunk)
+            concise_answer = answer['answer'][:300]  # Limiting answer to 300 characters (short answer)
+            st.write(f"Answer: {concise_answer}")
+            # Summarize the relevant chunk (concise summary after query answer)
             if relevant_chunk.strip():
+                if len(relevant_chunk.split()) > 20:  # Only summarize if text is long enough
                     try:
                         st.write("Summarizing...")
+                        summary = summarizer(relevant_chunk, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
                         st.write(f"Summary: {summary}")
                     except Exception as e:
                         st.write(f"Error summarizing text: {str(e)}")
                 else:
+                    st.write("Text is too short to summarize.")
             else:
                 st.write("No relevant text found to summarize.")