# app.py — Streamlit document summarizer and Q&A (revision 2380f79)
import streamlit as st
import fitz # PyMuPDF for PDF extraction
import faiss
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import requests
from io import BytesIO
import docx
import pandas as pd
# Initialize the summarization and question-answering models from Hugging Face.
# NOTE: pipeline() downloads/loads the model weights at import time, so the
# first start-up of the app is slow and requires network access.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Sentence Transformer used to embed sentences for similarity-based retrieval.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# FAISS Indexing Function with sentence-level retrieval
def create_faiss_index(text):
sentences = text.split(". ") # Split into sentences using "." as delimiter
embeddings = embedder.encode(sentences)
index = faiss.IndexFlatL2(384)
index.add(np.array(embeddings).astype(np.float32))
return index, sentences
# Function to retrieve the most relevant sentences
def retrieve_relevant_sentences(query, index, sentences):
query_embedding = embedder.encode([query])
D, I = index.search(np.array(query_embedding).astype(np.float32), 5) # Retrieve top 5 most similar sentences
relevant_sentences = [sentences[i] for i in I[0]]
return relevant_sentences
# Function to filter retrieved sentences based on keywords
def filter_sentences(query, sentences):
filtered_sentences = []
for sentence in sentences:
if any(word.lower() in sentence.lower() for word in query.split()):
filtered_sentences.append(sentence)
return filtered_sentences
# ---------------------------------------------------------------------------
# Streamlit UI — upload a document, extract its text, then answer a query
# against it and summarize the retrieved passage.
# ---------------------------------------------------------------------------
st.title("Concise Summarizer and Q&A")

# Upload File
uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    # MIME type reported by the browser for the uploaded file.
    file_type = uploaded_file.type

    # Extract text based on file type
    if file_type == "application/pdf":
        # PyMuPDF reads the PDF from the in-memory byte stream.
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        # python-docx wants a file-like object, so wrap the bytes in BytesIO.
        doc = docx.Document(BytesIO(uploaded_file.read()))
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        df = pd.read_excel(uploaded_file, engine="openpyxl")
        text = ""
        # Flatten every column into newline-separated strings; NaN cells dropped.
        for col in df.columns:
            text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
    else:
        st.error("Unsupported file type!")
        text = ""  # keep `text` defined so the guard below is safe

    if text:
        # Display the first 500 characters of extracted text
        st.write("Text extracted from file:")
        st.write(text[:500])  # Show first 500 characters

        # Create FAISS index
        # NOTE(review): this re-embeds the whole document on every Streamlit
        # rerun (e.g. each query keystroke submit) — consider caching.
        index, sentences = create_faiss_index(text)

        # Input for user query
        query = st.text_input("Enter your query:")
        if query:
            st.write("Retrieving relevant information...")
            # Retrieve nearest sentences, then keep only those sharing a
            # keyword with the query.
            relevant_sentences = retrieve_relevant_sentences(query, index, sentences)
            filtered_sentences = filter_sentences(query, relevant_sentences)

            # Combine filtered sentences into a single string
            relevant_text = " ".join(filtered_sentences)
            st.write(f"Relevant Text: {relevant_text}")

            # Answer the question based on the relevant chunk
            st.write("Answering the question...")
            try:
                answer = qa_pipeline(question=query, context=relevant_text)
                concise_answer = answer['answer']
                st.write(f"Answer: {concise_answer}")
            except Exception as e:
                # Surface pipeline failures (e.g. empty context) in the UI
                # rather than crashing the app.
                st.write(f"Error answering question: {str(e)}")

            # Summarize the relevant chunk (concise summary after query answer)
            if relevant_text.strip():
                if len(relevant_text.split()) > 20:  # Only summarize if text is long enough
                    try:
                        st.write("Summarizing...")
                        summary = summarizer(relevant_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
                        st.write(f"Summary: {summary}")
                    except Exception as e:
                        st.write(f"Error summarizing text: {str(e)}")
                else:
                    st.write("Text is too short to summarize.")
            else:
                st.write("No relevant text found to summarize.")