# app.py — Streamlit document summarizer and Q&A (revision 2380f79)
import streamlit as st
import fitz # PyMuPDF for PDF extraction
import faiss
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import requests
from io import BytesIO
import docx
import pandas as pd
# Initialize the summarization and question-answering models from Hugging Face.
# NOTE: pipeline() downloads/loads the model weights at import time, so the
# first start-up of the app is slow and requires network access.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Sentence Transformer used to embed sentences for similarity-based retrieval.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# FAISS Indexing Function with sentence-level retrieval
def create_faiss_index(text):
sentences = text.split(". ") # Split into sentences using "." as delimiter
embeddings = embedder.encode(sentences)
index = faiss.IndexFlatL2(384)
index.add(np.array(embeddings).astype(np.float32))
return index, sentences
# Function to retrieve the most relevant sentences
def retrieve_relevant_sentences(query, index, sentences):
query_embedding = embedder.encode([query])
D, I = index.search(np.array(query_embedding).astype(np.float32), 5) # Retrieve top 5 most similar sentences
relevant_sentences = [sentences[i] for i in I[0]]
return relevant_sentences
# Function to filter retrieved sentences based on keywords
def filter_sentences(query, sentences):
filtered_sentences = []
for sentence in sentences:
if any(word.lower() in sentence.lower() for word in query.split()):
filtered_sentences.append(sentence)
return filtered_sentences
# ---------------------------------------------------------------------------
# Streamlit UI — upload a document, extract its text, then answer a query
# against it and summarize the retrieved passage.
# ---------------------------------------------------------------------------
st.title("Concise Summarizer and Q&A")

# Upload File
uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    # MIME type reported by the browser for the uploaded file.
    file_type = uploaded_file.type

    # Extract text based on file type
    if file_type == "application/pdf":
        # PyMuPDF reads the PDF from the in-memory byte stream.
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        # python-docx wants a file-like object, so wrap the bytes in BytesIO.
        doc = docx.Document(BytesIO(uploaded_file.read()))
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        df = pd.read_excel(uploaded_file, engine="openpyxl")
        text = ""
        # Flatten every column into newline-separated strings; NaN cells dropped.
        for col in df.columns:
            text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
    else:
        st.error("Unsupported file type!")
        text = ""  # keep `text` defined so the guard below is safe

    if text:
        # Display the first 500 characters of extracted text
        st.write("Text extracted from file:")
        st.write(text[:500])  # Show first 500 characters

        # Create FAISS index
        # NOTE(review): this re-embeds the whole document on every Streamlit
        # rerun (e.g. each query keystroke submit) — consider caching.
        index, sentences = create_faiss_index(text)

        # Input for user query
        query = st.text_input("Enter your query:")
        if query:
            st.write("Retrieving relevant information...")
            # Retrieve nearest sentences, then keep only those sharing a
            # keyword with the query.
            relevant_sentences = retrieve_relevant_sentences(query, index, sentences)
            filtered_sentences = filter_sentences(query, relevant_sentences)

            # Combine filtered sentences into a single string
            relevant_text = " ".join(filtered_sentences)
            st.write(f"Relevant Text: {relevant_text}")

            # Answer the question based on the relevant chunk
            st.write("Answering the question...")
            try:
                answer = qa_pipeline(question=query, context=relevant_text)
                concise_answer = answer['answer']
                st.write(f"Answer: {concise_answer}")
            except Exception as e:
                # Surface pipeline failures (e.g. empty context) in the UI
                # rather than crashing the app.
                st.write(f"Error answering question: {str(e)}")

            # Summarize the relevant chunk (concise summary after query answer)
            if relevant_text.strip():
                if len(relevant_text.split()) > 20:  # Only summarize if text is long enough
                    try:
                        st.write("Summarizing...")
                        summary = summarizer(relevant_text, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
                        st.write(f"Summary: {summary}")
                    except Exception as e:
                        st.write(f"Error summarizing text: {str(e)}")
                else:
                    st.write("Text is too short to summarize.")
            else:
                st.write("No relevant text found to summarize.")