Update app.py
app.py CHANGED
@@ -10,40 +10,12 @@ import docx
 import pandas as pd
 
 # Initialize the summarization and question-answering models from Hugging Face
-summarizer = pipeline("summarization", model="facebook/bart-large-cnn", min_length=50, max_length=
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn", min_length=50, max_length=80)  # Concise summary settings
 qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")  # Better QA model for concise answers
 
 # Sentence Transformer for embedding-based retrieval
 embedder = SentenceTransformer('all-MiniLM-L6-v2')  # A compact and efficient embedding model
 
-# Groq API Configuration
-API_KEY = "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"  # Replace with your actual Groq API key
-API_URL = "https://api.groq.com/openai/v1/chat/completions"  # Default endpoint for chat completions (check Groq docs)
-
-# PDF Processing Function
-def extract_text_from_pdf(pdf_file):
-    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-    text = ""
-    for page in doc:
-        text += page.get_text()
-    return text
-
-# MS Word Processing Function
-def extract_text_from_word(word_file):
-    doc = docx.Document(BytesIO(word_file.read()))
-    text = ""
-    for para in doc.paragraphs:
-        text += para.text + "\n"
-    return text
-
-# Excel File Processing Function
-def extract_text_from_excel(excel_file):
-    df = pd.read_excel(excel_file, engine="openpyxl")  # Use openpyxl to read .xlsx files
-    text = ""
-    for col in df.columns:
-        text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"  # Join values in each column
-    return text
-
 # FAISS Indexing Function with better embedding-based chunking
 def create_faiss_index(text):
     paragraphs = text.split('\n\n')  # Assuming paragraphs are separated by double newlines
@@ -58,28 +30,8 @@ def retrieve_relevant_chunk(query, index, paragraphs):
     D, I = index.search(np.array(query_embedding).astype(np.float32), 1)  # Search for the closest chunk
     return paragraphs[I[0][0]]  # Return the most relevant paragraph
 
-# Function to call Groq API for summarization (optional)
-def call_groq_api(input_text):
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json"
-    }
-    payload = {
-        "model": "gpt-3.5-turbo",  # Use the correct model if available
-        "messages": [{"role": "user", "content": input_text}],
-        "n": 1
-    }
-    try:
-        response = requests.post(API_URL, json=payload, headers=headers)
-        if response.status_code == 200:
-            return response.json().get("choices", [{}])[0].get("message", {}).get("content", "No result found")
-        else:
-            return f"Error: {response.status_code} - {response.text}"
-    except Exception as e:
-        return f"Error: {str(e)}"
-
 # Streamlit UI
-st.title("
+st.title("Concise Summarizer and Q&A")
 
 # Upload File
 uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
@@ -89,11 +41,20 @@ if uploaded_file:
 
     # Extract text based on file type
     if file_type == "application/pdf":
-        text = extract_text_from_pdf(uploaded_file)
+        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
+        text = ""
+        for page in doc:
+            text += page.get_text()
     elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-        text = extract_text_from_word(uploaded_file)
+        doc = docx.Document(BytesIO(uploaded_file.read()))
+        text = ""
+        for para in doc.paragraphs:
+            text += para.text + "\n"
     elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
-        text = extract_text_from_excel(uploaded_file)
+        df = pd.read_excel(uploaded_file, engine="openpyxl")  # Use openpyxl to read .xlsx files
+        text = ""
+        for col in df.columns:
+            text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
     else:
         st.error("Unsupported file type!")
         text = ""
@@ -124,7 +85,7 @@ if uploaded_file:
     if len(relevant_chunk.split()) > 20:  # Only summarize if the text is sufficiently long
         try:
             st.write("Summarizing...")
-            summary = summarizer(relevant_chunk, max_length=
+            summary = summarizer(relevant_chunk, max_length=80, min_length=50, do_sample=False)[0]['summary_text']
             st.write(f"Summary: {summary}")
         except Exception as e:
             st.write(f"Error summarizing text: {str(e)}")
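
The commit keeps the two Hugging Face pipelines and only tightens the summarizer's length bounds. For readers who want to try the new settings in isolation, a minimal self-contained sketch using the same models and parameters as the updated app.py (the sample text and question are invented for illustration):

from transformers import pipeline

# Same settings as the updated app.py: 50-80 token summaries, deterministic decoding
summarizer = pipeline("summarization", model="facebook/bart-large-cnn",
                      min_length=50, max_length=80)
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

text = (
    "FAISS is a library for efficient similarity search and clustering of dense "
    "vectors. It contains algorithms that search in sets of vectors of any size, "
    "up to ones that possibly do not fit in RAM. It also contains supporting "
    "code for evaluation and parameter tuning."
)

print(summarizer(text, max_length=80, min_length=50, do_sample=False)[0]["summary_text"])
print(qa_pipeline(question="What is FAISS used for?", context=text)["answer"])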
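The unchanged create_faiss_index / retrieve_relevant_chunk lines framing the second hunk rely on exact L2 search over paragraph embeddings. A self-contained sketch of that retrieval pattern, matching the calls visible in the diff context (the paragraph texts and query are invented for illustration):

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed each paragraph and index the vectors for exact L2 search
paragraphs = ["A paragraph about summarization.", "A paragraph about spreadsheets."]
embeddings = np.array(embedder.encode(paragraphs)).astype(np.float32)
index = faiss.IndexFlatL2(embeddings.shape[1])  # all-MiniLM-L6-v2 produces 384-dim vectors
index.add(embeddings)

# Retrieve the single closest paragraph for a query, as retrieve_relevant_chunk does
query_embedding = np.array(embedder.encode(["How do I summarize text?"])).astype(np.float32)
D, I = index.search(query_embedding, 1)
print(paragraphs[I[0][0]])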
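The deleted API_KEY line had a real-looking Groq key committed in plain text. If a hosted API is ever reintroduced, a common pattern is to read the secret from the environment instead; a hypothetical sketch (the GROQ_API_KEY variable name is an assumption, not part of this commit):

import os

# Hypothetical: keep the key out of the repository by loading it at runtime
API_KEY = os.environ.get("GROQ_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the GROQ_API_KEY environment variable")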