File size: 4,559 Bytes
2ce24b4
 
 
 
5492263
1f0bfc6
2ce24b4
2558e6a
 
 
2ce24b4
364de8d
d9e1c6d
 
1f0bfc6
 
d9e1c6d
2ce24b4
d9e1c6d
2ce24b4
d9e1c6d
 
 
 
 
2ce24b4
d9e1c6d
 
 
 
 
 
 
 
 
 
 
 
 
 
2ce24b4
 
1b61846
61ee9be
2558e6a
 
 
 
d9e1c6d
2558e6a
 
 
1b61846
 
 
 
2558e6a
1b61846
 
 
 
2558e6a
1a419f4
1b61846
 
 
2558e6a
 
 
 
 
1a419f4
2380f79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import streamlit as st
import fitz  # PyMuPDF for PDF extraction
import faiss
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import requests
from io import BytesIO
import docx
import pandas as pd

# Hugging Face pipelines, loaded once at import time (weights are downloaded
# and cached on first run): BART for abstractive summarization and a
# SQuAD2-fine-tuned RoBERTa for extractive question answering.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Sentence-embedding model used for FAISS retrieval. NOTE(review): the FAISS
# index below is built with dimension 384, which must match this model's
# output width — confirm if the model is ever swapped.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# FAISS Indexing Function with sentence-level retrieval
def create_faiss_index(text):
    """Build an L2 FAISS index over the sentences of *text*.

    The text is split on ". " (a crude sentence boundary — abbreviations and
    newlines are not handled), each sentence is embedded with the module-level
    ``embedder``, and all vectors are added to a flat L2 index.

    Returns:
        (index, sentences): the populated ``faiss.IndexFlatL2`` and the list
        of sentence strings, index-aligned with the stored vectors.
    """
    sentences = text.split(". ")  # Split into sentences using "." as delimiter
    embeddings = np.asarray(embedder.encode(sentences), dtype=np.float32)
    # Derive the dimension from the embedding matrix instead of hard-coding
    # 384 — keeps the index correct if the embedding model changes.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, sentences

# Function to retrieve the most relevant sentences
def retrieve_relevant_sentences(query, index, sentences, top_k=5):
    """Return up to *top_k* sentences most similar to *query*.

    Embeds the query with the module-level ``embedder`` and runs a k-NN
    search against *index*. The result list is ordered by increasing L2
    distance.

    Fixes: the original always searched with k=5; when the document had
    fewer than 5 sentences FAISS pads the result ids with -1, and
    ``sentences[-1]`` silently returned the *last* sentence instead of a
    real match. We clamp k and drop any -1 ids.
    """
    if not sentences:
        return []
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    k = min(top_k, len(sentences))  # never ask FAISS for more hits than exist
    D, I = index.search(query_embedding, k)
    # I is shape (1, k); -1 marks "no result" slots and must be skipped.
    return [sentences[i] for i in I[0] if i >= 0]

# Function to filter retrieved sentences based on keywords
def filter_sentences(query, sentences):
    """Keep only the sentences containing at least one query word.

    Matching is case-insensitive and substring-based: a query word matches
    anywhere inside the sentence, not only at word boundaries. An empty
    query yields an empty result.
    """
    needles = [word.lower() for word in query.split()]
    return [
        sentence
        for sentence in sentences
        if any(needle in sentence.lower() for needle in needles)
    ]

# Streamlit UI — top-level script flow:
#   upload -> extract text -> build FAISS index -> take a query ->
#   retrieve + keyword-filter sentences -> extractive QA -> optional summary.
st.title("Concise Summarizer and Q&A")

# Upload File — only these three MIME types are handled below.
uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    file_type = uploaded_file.type

    # Extract text based on file type (dispatch on the browser-reported MIME type).
    if file_type == "application/pdf":
        # PyMuPDF reads the raw bytes; concatenates plain text of every page.
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        # python-docx needs a file-like object, hence the BytesIO wrapper.
        doc = docx.Document(BytesIO(uploaded_file.read()))
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        # Excel: flatten every column's non-empty cells into newline-joined text.
        # NOTE(review): only the first sheet is read (pandas default) — confirm
        # that multi-sheet workbooks are out of scope.
        df = pd.read_excel(uploaded_file, engine="openpyxl")
        text = ""
        for col in df.columns:
            text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
    else:
        st.error("Unsupported file type!")
        text = ""

    if text:
        # Display the first 500 characters of extracted text as a sanity preview.
        st.write("Text extracted from file:")
        st.write(text[:500])  # Show first 500 characters

        # Create FAISS index (re-built on every Streamlit rerun — no caching).
        index, sentences = create_faiss_index(text)

        # Input for user query
        query = st.text_input("Enter your query:")

        if query:
            st.write("Retrieving relevant information...")
            # Embedding-based top-k retrieval, then a keyword filter that can
            # legitimately leave nothing (relevant_text may end up empty).
            relevant_sentences = retrieve_relevant_sentences(query, index, sentences)
            filtered_sentences = filter_sentences(query, relevant_sentences)

            # Combine filtered sentences into a single string
            relevant_text = " ".join(filtered_sentences)

            st.write(f"Relevant Text: {relevant_text}")

            # Answer the question based on the relevant chunk
            st.write("Answering the question...")
            try:
                # Extractive QA: the answer is a span copied out of relevant_text.
                answer = qa_pipeline(question=query, context=relevant_text)
                concise_answer = answer['answer']
                st.write(f"Answer: {concise_answer}")
            except Exception as e:
                # Broad catch is deliberate: the pipeline raises on empty/short
                # context and the UI should degrade gracefully, not crash.
                st.write(f"Error answering question: {str(e)}")

            # Summarize the relevant chunk (concise summary after query answer)
            if relevant_text.strip():
                if len(relevant_text.split()) > 20:  # Only summarize if text is long enough
                    try:
                        st.write("Summarizing...")
                        # BART abstractive summary, bounded to 30–50 tokens.
                        summary = summarizer(relevant_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
                        st.write(f"Summary: {summary}")
                    except Exception as e:
                        st.write(f"Error summarizing text: {str(e)}")
                else:
                    st.write("Text is too short to summarize.")
            else:
                st.write("No relevant text found to summarize.")