Update app.py
Browse files
app.py
CHANGED
|
@@ -10,8 +10,8 @@ import docx
|
|
| 10 |
import pandas as pd
|
| 11 |
|
| 12 |
# Initialize the summarization and question-answering models from Hugging Face
|
| 13 |
-
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 14 |
-
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2") # Better QA model
|
| 15 |
|
| 16 |
# Sentence Transformer for embedding-based retrieval
|
| 17 |
embedder = SentenceTransformer('all-MiniLM-L6-v2') # A compact and efficient embedding model
|
|
@@ -22,7 +22,6 @@ API_URL = "https://api.groq.com/openai/v1/chat/completions" # Default endpoint
|
|
| 22 |
|
| 23 |
# PDF Processing Function
|
| 24 |
def extract_text_from_pdf(pdf_file):
|
| 25 |
-
# Read the file from the uploaded BytesIO object
|
| 26 |
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
| 27 |
text = ""
|
| 28 |
for page in doc:
|
|
@@ -47,11 +46,8 @@ def extract_text_from_excel(excel_file):
|
|
| 47 |
|
| 48 |
# FAISS Indexing Function with better embedding-based chunking
|
| 49 |
def create_faiss_index(text):
|
| 50 |
-
# Split the text into paragraphs or logical sections for better context
|
| 51 |
paragraphs = text.split('\n\n') # Assuming paragraphs are separated by double newlines
|
| 52 |
embeddings = embedder.encode(paragraphs) # Use sentence-transformers to create embeddings
|
| 53 |
-
|
| 54 |
-
# Create a FAISS index
|
| 55 |
index = faiss.IndexFlatL2(384) # We use 384-dimensional embeddings (from the chosen model)
|
| 56 |
index.add(np.array(embeddings).astype(np.float32)) # Add the embeddings to the FAISS index
|
| 57 |
return index, paragraphs
|
|
@@ -123,13 +119,12 @@ if uploaded_file:
|
|
| 123 |
answer = qa_pipeline(question=query, context=relevant_chunk) # Use question-answering pipeline
|
| 124 |
st.write(f"Answer: {answer['answer']}")
|
| 125 |
|
| 126 |
-
# Summarize the relevant chunk
|
| 127 |
if relevant_chunk.strip():
|
| 128 |
-
# Ensure it's long enough for summarization (avoid too short text)
|
| 129 |
if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
|
| 130 |
try:
|
| 131 |
st.write("Summarizing...")
|
| 132 |
-
summary = summarizer(relevant_chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
|
| 133 |
st.write(f"Summary: {summary}")
|
| 134 |
except Exception as e:
|
| 135 |
st.write(f"Error summarizing text: {str(e)}")
|
|
|
|
| 10 |
import pandas as pd

# Hugging Face pipelines: abstractive summarization constrained to a short,
# concise output window, plus an extractive question-answering model.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", min_length=50, max_length=100)  # Concise summary settings
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")  # Better QA model for concise answers

# Compact sentence-embedding model used for embedding-based retrieval.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
| 22 |
|
| 23 |
# PDF Processing Function
|
| 24 |
def extract_text_from_pdf(pdf_file):
|
|
|
|
| 25 |
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
| 26 |
text = ""
|
| 27 |
for page in doc:
|
|
|
|
| 46 |
|
| 47 |
# FAISS Indexing Function with better embedding-based chunking
|
| 48 |
def create_faiss_index(text):
    """Build a FAISS L2 index over paragraph embeddings of ``text``.

    The text is chunked into paragraphs (split on blank lines), each
    paragraph is embedded with the module-level SentenceTransformer
    ``embedder``, and the vectors are stored in a flat (exact-search)
    L2 index.

    Args:
        text: Full document text; paragraphs are assumed to be
            separated by double newlines.

    Returns:
        tuple: ``(index, paragraphs)`` — the populated
        ``faiss.IndexFlatL2`` and the list of paragraph strings,
        index-aligned with the stored vectors.
    """
    paragraphs = text.split('\n\n')  # Assuming paragraphs are separated by double newlines
    # Encode once and convert to float32 up front — FAISS only accepts float32.
    embeddings = np.asarray(embedder.encode(paragraphs), dtype=np.float32)

    # Derive the dimensionality from the embeddings themselves instead of
    # hard-coding 384: swapping the embedding model can no longer silently
    # produce a dimension-mismatched index.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, paragraphs
|
|
|
|
| 119 |
answer = qa_pipeline(question=query, context=relevant_chunk) # Use question-answering pipeline
|
| 120 |
st.write(f"Answer: {answer['answer']}")
|
| 121 |
|
| 122 |
+
# Summarize the relevant chunk (display summary after answering the query)
|
| 123 |
if relevant_chunk.strip():
|
|
|
|
| 124 |
if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
|
| 125 |
try:
|
| 126 |
st.write("Summarizing...")
|
| 127 |
+
summary = summarizer(relevant_chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
|
| 128 |
st.write(f"Summary: {summary}")
|
| 129 |
except Exception as e:
|
| 130 |
st.write(f"Error summarizing text: {str(e)}")
|