# Streamlit app: upload a PDF/Word/Excel file, index its sentences with FAISS,
# then answer questions about it and summarize the retrieved context.
import streamlit as st
import fitz # PyMuPDF for PDF extraction
import faiss
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import requests
from io import BytesIO
import docx
import pandas as pd
# Initialize the summarization, question-answering, and embedding models.
# Cached with st.cache_resource so Streamlit reruns (triggered on every widget
# interaction) do not re-download / re-initialize the models each time.
@st.cache_resource
def _load_models():
    """Load the Hugging Face pipelines and the sentence embedder once per server.

    Returns:
        tuple: (summarizer, qa_pipeline, embedder) ready-to-use model objects.
    """
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
    # Sentence Transformer for embedding-based retrieval.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    return summarizer, qa_pipeline, embedder

summarizer, qa_pipeline, embedder = _load_models()
# FAISS indexing function with sentence-level retrieval.
def create_faiss_index(text):
    """Build a FAISS L2 index over sentence embeddings of *text*.

    The text is split naively on ". " (so abbreviations may produce ragged
    "sentences"); each piece is embedded with the module-level SentenceTransformer.

    Args:
        text: The full document text to index.

    Returns:
        tuple: (index, sentences) — the populated ``faiss.IndexFlatL2`` and the
        list of sentence strings, in the same order as the index rows.
    """
    sentences = text.split(". ")
    embeddings = np.asarray(embedder.encode(sentences), dtype=np.float32)
    # Derive the dimensionality from the embeddings instead of hard-coding 384,
    # so swapping the embedding model cannot silently corrupt the index.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, sentences
# Function to retrieve the most relevant sentences for a query.
def retrieve_relevant_sentences(query, index, sentences, top_k=5):
    """Return up to *top_k* sentences from *sentences* most similar to *query*.

    Args:
        query: The user's natural-language question.
        index: A FAISS index built over embeddings of *sentences*.
        sentences: The sentence list in the same order as the index rows.
        top_k: Maximum number of neighbors to retrieve (default 5, matching
            the original behavior).

    Returns:
        list[str]: The retrieved sentences, most similar first.
    """
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    # Never ask FAISS for more neighbors than the index holds: for k > ntotal
    # it pads the result with -1, which would silently alias sentences[-1].
    k = min(top_k, len(sentences))
    D, I = index.search(query_embedding, k)
    return [sentences[i] for i in I[0] if i != -1]
# Keyword filter applied on top of the embedding-based retrieval.
def filter_sentences(query, sentences):
    """Keep only the sentences containing at least one query word.

    Matching is case-insensitive and purely substring-based: any whitespace-
    separated token of *query* occurring anywhere in a sentence keeps it.

    Args:
        query: The user's question; split on whitespace into search terms.
        sentences: Candidate sentences to filter.

    Returns:
        list[str]: The matching sentences, in their original order.
    """
    terms = [term.lower() for term in query.split()]
    return [
        sentence
        for sentence in sentences
        if any(term in sentence.lower() for term in terms)
    ]
# ---------------------------------------------------------------------------
# Streamlit UI: upload a document, extract its text, then answer questions
# about it and summarize the retrieved context.
# ---------------------------------------------------------------------------
st.title("Concise Summarizer and Q&A")

# Upload file (type list restricts the browser picker; MIME is checked below).
uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    file_type = uploaded_file.type
    # --- Extract plain text, dispatching on the browser-reported MIME type ---
    if file_type == "application/pdf":
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        text = "".join(page.get_text() for page in doc)
    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = docx.Document(BytesIO(uploaded_file.read()))
        text = "".join(para.text + "\n" for para in doc.paragraphs)
    elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        df = pd.read_excel(uploaded_file, engine="openpyxl")
        # Flatten column by column; non-string cells are stringified, NaNs dropped.
        text = "".join(
            "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
            for col in df.columns
        )
    else:
        st.error("Unsupported file type!")
        text = ""

    if text:
        # Preview only the first 500 characters; the full text is still indexed.
        st.write("Text extracted from file:")
        st.write(text[:500])

        # Build the FAISS sentence index over the whole document.
        index, sentences = create_faiss_index(text)

        # Input for user query.
        query = st.text_input("Enter your query:")
        if query:
            st.write("Retrieving relevant information...")
            relevant_sentences = retrieve_relevant_sentences(query, index, sentences)
            filtered_sentences = filter_sentences(query, relevant_sentences)
            # Fall back to the raw retrieval when the keyword filter removes
            # everything — an empty context would make the QA model always fail.
            if not filtered_sentences:
                filtered_sentences = relevant_sentences
            relevant_text = " ".join(filtered_sentences)
            st.write(f"Relevant Text: {relevant_text}")

            # Answer the question based on the retrieved context.
            st.write("Answering the question...")
            try:
                answer = qa_pipeline(question=query, context=relevant_text)
                concise_answer = answer['answer']
                st.write(f"Answer: {concise_answer}")
            except Exception as e:
                # Surface model errors in the UI instead of crashing the app.
                st.write(f"Error answering question: {str(e)}")

            # Summarize the retrieved context (concise summary after the answer).
            if relevant_text.strip():
                if len(relevant_text.split()) > 20:  # only summarize long-enough text
                    try:
                        st.write("Summarizing...")
                        summary = summarizer(relevant_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
                        st.write(f"Summary: {summary}")
                    except Exception as e:
                        st.write(f"Error summarizing text: {str(e)}")
                else:
                    st.write("Text is too short to summarize.")
            else:
                st.write("No relevant text found to summarize.")