|
|
import streamlit as st |
|
|
import fitz |
|
|
import faiss |
|
|
import numpy as np |
|
|
from transformers import pipeline |
|
|
from sentence_transformers import SentenceTransformer |
|
|
import requests |
|
|
from io import BytesIO |
|
|
import docx |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
# Models are loaded once at module import. Streamlit re-executes this script
# on every widget interaction, so these loads happen per rerun unless cached;
# HuggingFace does cache the downloaded weights on disk.
# Abstractive summarizer (BART fine-tuned on CNN/DailyMail).
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Extractive question-answering: answers are spans copied from the context.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Sentence embedder used for dense retrieval; MiniLM-L6-v2 produces
# 384-dimensional vectors (matching the IndexFlatL2(384) built below).
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
|
|
|
|
def create_faiss_index(text):
    """Build a FAISS L2 index over the sentences of *text*.

    Parameters
    ----------
    text : str
        Raw document text. It is split into "sentences" on the literal
        separator ``". "`` — a crude heuristic that ignores newlines,
        abbreviations, etc.

    Returns
    -------
    tuple
        ``(index, sentences)`` where *index* is a populated
        ``faiss.IndexFlatL2`` and *sentences* is the list of sentence
        strings in the same order as the indexed vectors.
    """
    sentences = text.split(". ")
    embeddings = embedder.encode(sentences)
    # FAISS requires float32 input.
    vectors = np.asarray(embeddings, dtype=np.float32)
    # Derive the dimensionality from the embeddings rather than hard-coding
    # 384, so swapping the embedding model cannot silently break the index.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, sentences
|
|
|
|
|
|
|
|
def retrieve_relevant_sentences(query, index, sentences, top_k=5):
    """Return up to *top_k* sentences most similar to *query*.

    Parameters
    ----------
    query : str
        The user's question.
    index : faiss.Index
        Index built by :func:`create_faiss_index` over *sentences*.
    sentences : list[str]
        The sentences backing *index*, in index order.
    top_k : int, optional
        Maximum number of neighbours to retrieve (default 5, matching
        the previous hard-coded behaviour).

    Returns
    -------
    list[str]
        The retrieved sentences, nearest first.
    """
    query_embedding = embedder.encode([query])
    # Never request more neighbours than the index holds: FAISS pads the
    # missing slots in I with -1, and sentences[-1] would then silently
    # alias the last sentence via Python's negative indexing.
    k = min(top_k, len(sentences))
    D, I = index.search(np.asarray(query_embedding, dtype=np.float32), k)
    # Defensively skip any -1 padding FAISS may still return.
    return [sentences[i] for i in I[0] if i >= 0]
|
|
|
|
|
|
|
|
def filter_sentences(query, sentences):
    """Keep sentences sharing at least one word with *query*.

    Matching is case-insensitive substring containment, so the query word
    "cat" also matches "concatenate" — intentional loose lexical filter
    layered on top of the dense retrieval.

    Parameters
    ----------
    query : str
        Whitespace-separated query words. An empty query matches nothing.
    sentences : list[str]
        Candidate sentences.

    Returns
    -------
    list[str]
        The matching sentences, original order preserved.
    """
    # Lower-case the query words once instead of on every sentence.
    words = [w.lower() for w in query.split()]
    return [s for s in sentences if any(w in s.lower() for w in words)]
|
|
|
|
|
|
|
|
# --- Streamlit UI ---------------------------------------------------------
st.title("Concise Summarizer and Q&A")

# Accept a single document; the browser-reported MIME type
# (uploaded_file.type) drives the extraction dispatch below.
uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
|
|
|
|
|
if uploaded_file:
    file_type = uploaded_file.type

    # --- Text extraction, dispatched on the browser-reported MIME type ----
    if file_type == "application/pdf":
        # PyMuPDF opens directly from the in-memory byte stream.
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = docx.Document(BytesIO(uploaded_file.read()))
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        df = pd.read_excel(uploaded_file, engine="openpyxl")
        text = ""
        # Flatten the spreadsheet column by column, one non-empty cell per
        # line; row relationships across columns are lost by design here.
        for col in df.columns:
            text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
    else:
        # The uploader restricts extensions, so this branch is mostly a
        # safety net against unexpected MIME types.
        st.error("Unsupported file type!")
        text = ""

    if text:
        # Show only a preview to keep the page compact.
        st.write("Text extracted from file:")
        st.write(text[:500])

        # NOTE(review): the embedding + FAISS index is rebuilt on every
        # Streamlit rerun (every widget interaction); caching (e.g.
        # st.cache_resource keyed on the file) would avoid the recompute.
        index, sentences = create_faiss_index(text)

        query = st.text_input("Enter your query:")

        if query:
            st.write("Retrieving relevant information...")
            # Dense retrieval first, then a loose lexical keyword filter.
            relevant_sentences = retrieve_relevant_sentences(query, index, sentences)
            filtered_sentences = filter_sentences(query, relevant_sentences)

            relevant_text = " ".join(filtered_sentences)

            st.write(f"Relevant Text: {relevant_text}")

            # Extractive QA over the retrieved context. The pipeline can
            # raise (e.g. on an empty context); errors are displayed
            # instead of crashing the app.
            st.write("Answering the question...")
            try:
                answer = qa_pipeline(question=query, context=relevant_text)
                concise_answer = answer['answer']
                st.write(f"Answer: {concise_answer}")
            except Exception as e:
                st.write(f"Error answering question: {str(e)}")

            # Summarize only when there is enough material (> 20 words);
            # the summary length is constrained to 30-50 tokens.
            if relevant_text.strip():
                if len(relevant_text.split()) > 20:
                    try:
                        st.write("Summarizing...")
                        summary = summarizer(relevant_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
                        st.write(f"Summary: {summary}")
                    except Exception as e:
                        st.write(f"Error summarizing text: {str(e)}")
                else:
                    st.write("Text is too short to summarize.")
            else:
                st.write("No relevant text found to summarize.")