"""Streamlit app: upload a PDF, Word, or Excel file, retrieve query-relevant
sentences with a FAISS index over sentence embeddings, answer a question with
an extractive QA model, and summarize the relevant text."""

import streamlit as st
import fitz  # PyMuPDF for PDF extraction
import faiss
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import requests  # NOTE(review): unused here — kept in case another chunk of this project relies on it
from io import BytesIO
import docx
import pandas as pd


@st.cache_resource
def load_models():
    """Load the heavy HF pipelines and the sentence embedder exactly once.

    Streamlit re-executes the whole script on every interaction; without
    st.cache_resource the models would be re-downloaded/re-instantiated on
    each rerun.

    Returns:
        (summarizer, qa_pipeline, embedder) tuple.
    """
    return (
        pipeline("summarization", model="facebook/bart-large-cnn"),
        pipeline("question-answering", model="deepset/roberta-base-squad2"),
        SentenceTransformer('all-MiniLM-L6-v2'),
    )


# Keep the original module-level names so any external code using them still works.
summarizer, qa_pipeline, embedder = load_models()


def create_faiss_index(text):
    """Split *text* into sentences, embed them, and build a FAISS L2 index.

    Args:
        text: Raw document text.

    Returns:
        (index, sentences): the FAISS index and the parallel sentence list.
    """
    # Crude sentence split on ". "; drop empty/whitespace-only fragments so
    # they are never embedded or retrieved.
    sentences = [s.strip() for s in text.split(". ") if s.strip()]
    embeddings = np.asarray(embedder.encode(sentences), dtype=np.float32)
    # Derive the dimension from the embeddings instead of hardcoding 384,
    # so swapping the embedder model keeps the index consistent.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, sentences


def retrieve_relevant_sentences(query, index, sentences, top_k=5):
    """Return up to *top_k* sentences most similar to *query*.

    Args:
        query: User query string.
        index: FAISS index built by create_faiss_index.
        sentences: Sentence list parallel to the index.
        top_k: Maximum number of neighbors to retrieve (default 5).

    Returns:
        List of retrieved sentences, most similar first.
    """
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    # Clamp k: asking FAISS for more neighbors than indexed vectors pads the
    # result with -1, and sentences[-1] would silently return the wrong item.
    k = min(top_k, len(sentences))
    D, I = index.search(query_embedding, k)
    return [sentences[i] for i in I[0] if i >= 0]


def filter_sentences(query, sentences):
    """Keep only sentences containing at least one query word (case-insensitive).

    Args:
        query: User query string.
        sentences: Candidate sentences.

    Returns:
        Filtered list of sentences (possibly empty).
    """
    words = [w.lower() for w in query.split()]
    return [s for s in sentences if any(w in s.lower() for w in words)]


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("Concise Summarizer and Q&A")

uploaded_file = st.file_uploader(
    "Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"]
)

if uploaded_file:
    file_type = uploaded_file.type

    # Extract text based on file type.
    if file_type == "application/pdf":
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        text = "".join(page.get_text() for page in doc)
    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = docx.Document(BytesIO(uploaded_file.read()))
        text = "".join(para.text + "\n" for para in doc.paragraphs)
    elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        df = pd.read_excel(uploaded_file, engine="openpyxl")
        text = "".join(
            "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
            for col in df.columns
        )
    else:
        st.error("Unsupported file type!")
        text = ""

    if text:
        # Display the first 500 characters of extracted text.
        st.write("Text extracted from file:")
        st.write(text[:500])

        # Create FAISS index over the document sentences.
        index, sentences = create_faiss_index(text)

        query = st.text_input("Enter your query:")

        if query:
            st.write("Retrieving relevant information...")
            relevant_sentences = retrieve_relevant_sentences(query, index, sentences)
            filtered_sentences = filter_sentences(query, relevant_sentences)
            # Keyword filtering can discard everything (e.g. paraphrased
            # queries); fall back to the raw embedding hits so the QA model
            # still gets a non-empty context.
            if not filtered_sentences:
                filtered_sentences = relevant_sentences

            relevant_text = " ".join(filtered_sentences)
            st.write(f"Relevant Text: {relevant_text}")

            if relevant_text.strip():
                # Answer the question based on the relevant chunk.
                st.write("Answering the question...")
                try:
                    answer = qa_pipeline(question=query, context=relevant_text)
                    concise_answer = answer['answer']
                    st.write(f"Answer: {concise_answer}")
                except Exception as e:
                    st.write(f"Error answering question: {str(e)}")

                # Summarize the relevant chunk (concise summary after query answer).
                if len(relevant_text.split()) > 20:  # Only summarize if text is long enough
                    try:
                        st.write("Summarizing...")
                        summary = summarizer(
                            relevant_text, max_length=50, min_length=30, do_sample=False
                        )[0]['summary_text']
                        st.write(f"Summary: {summary}")
                    except Exception as e:
                        st.write(f"Error summarizing text: {str(e)}")
                else:
                    st.write("Text is too short to summarize.")
            else:
                st.write("No relevant text found to summarize.")