Update app.py
Browse files
app.py
CHANGED
|
@@ -10,8 +10,8 @@ import docx
|
|
| 10 |
import pandas as pd
|
| 11 |
|
| 12 |
# Initialize the summarization and question-answering models from Hugging Face
|
| 13 |
-
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 14 |
-
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2") # Better QA model
|
| 15 |
|
| 16 |
# Sentence Transformer for embedding-based retrieval
|
| 17 |
embedder = SentenceTransformer('all-MiniLM-L6-v2') # A compact and efficient embedding model
|
|
@@ -22,7 +22,6 @@ API_URL = "https://api.groq.com/openai/v1/chat/completions" # Default endpoint
|
|
| 22 |
|
| 23 |
# PDF Processing Function
|
| 24 |
def extract_text_from_pdf(pdf_file):
|
| 25 |
-
# Read the file from the uploaded BytesIO object
|
| 26 |
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
| 27 |
text = ""
|
| 28 |
for page in doc:
|
|
@@ -47,11 +46,8 @@ def extract_text_from_excel(excel_file):
|
|
| 47 |
|
| 48 |
# FAISS Indexing Function with better embedding-based chunking
|
| 49 |
def create_faiss_index(text):
|
| 50 |
-
# Split the text into paragraphs or logical sections for better context
|
| 51 |
paragraphs = text.split('\n\n') # Assuming paragraphs are separated by double newlines
|
| 52 |
embeddings = embedder.encode(paragraphs) # Use sentence-transformers to create embeddings
|
| 53 |
-
|
| 54 |
-
# Create a FAISS index
|
| 55 |
index = faiss.IndexFlatL2(384) # We use 384-dimensional embeddings (from the chosen model)
|
| 56 |
index.add(np.array(embeddings).astype(np.float32)) # Add the embeddings to the FAISS index
|
| 57 |
return index, paragraphs
|
|
@@ -123,13 +119,12 @@ if uploaded_file:
|
|
| 123 |
answer = qa_pipeline(question=query, context=relevant_chunk) # Use question-answering pipeline
|
| 124 |
st.write(f"Answer: {answer['answer']}")
|
| 125 |
|
| 126 |
-
# Summarize the relevant chunk
|
| 127 |
if relevant_chunk.strip():
|
| 128 |
-
# Ensure it's long enough for summarization (avoid too short text)
|
| 129 |
if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
|
| 130 |
try:
|
| 131 |
st.write("Summarizing...")
|
| 132 |
-
summary = summarizer(relevant_chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
|
| 133 |
st.write(f"Summary: {summary}")
|
| 134 |
except Exception as e:
|
| 135 |
st.write(f"Error summarizing text: {str(e)}")
|
|
|
|
| 10 |
import pandas as pd

# Hugging Face pipelines: abstractive summarization constrained to a short,
# concise output window, plus an extractive question-answering model.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", min_length=50, max_length=100)  # Concise summary settings
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")  # Better QA model for concise answers

# Compact sentence-embedding model used for embedding-based retrieval.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
| 22 |
|
| 23 |
# PDF Processing Function
|
| 24 |
def extract_text_from_pdf(pdf_file):
|
|
|
|
| 25 |
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
| 26 |
text = ""
|
| 27 |
for page in doc:
|
|
|
|
| 46 |
|
| 47 |
# FAISS Indexing Function with better embedding-based chunking
|
| 48 |
def create_faiss_index(text):
    """Build a FAISS L2 index over paragraph embeddings of ``text``.

    The text is chunked into paragraphs (split on blank lines), each
    paragraph is embedded with the module-level SentenceTransformer
    ``embedder``, and the vectors are stored in a flat (exact-search)
    L2 index.

    Args:
        text: Full document text; paragraphs are assumed to be
            separated by double newlines.

    Returns:
        tuple: ``(index, paragraphs)`` — the populated
        ``faiss.IndexFlatL2`` and the list of paragraph strings,
        index-aligned with the stored vectors.
    """
    paragraphs = text.split('\n\n')  # Assuming paragraphs are separated by double newlines
    # Encode once and convert to float32 up front — FAISS only accepts float32.
    embeddings = np.asarray(embedder.encode(paragraphs), dtype=np.float32)

    # Derive the dimensionality from the embeddings themselves instead of
    # hard-coding 384: swapping the embedding model can no longer silently
    # produce a dimension-mismatched index.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, paragraphs
|
|
|
|
| 119 |
answer = qa_pipeline(question=query, context=relevant_chunk) # Use question-answering pipeline
|
| 120 |
st.write(f"Answer: {answer['answer']}")
|
| 121 |
|
| 122 |
+
# Summarize the relevant chunk (display summary after answering the query)
|
| 123 |
if relevant_chunk.strip():
|
|
|
|
| 124 |
if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
|
| 125 |
try:
|
| 126 |
st.write("Summarizing...")
|
| 127 |
+
summary = summarizer(relevant_chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
|
| 128 |
st.write(f"Summary: {summary}")
|
| 129 |
except Exception as e:
|
| 130 |
st.write(f"Error summarizing text: {str(e)}")
|