AzizWazir committed on
Commit
1a419f4
·
verified ·
1 Parent(s): 1b61846

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -20
app.py CHANGED
@@ -10,25 +10,25 @@ import docx
10
  import pandas as pd
11
 
12
  # Initialize the summarization and question-answering models from Hugging Face
13
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", min_length=50, max_length=80) # Concise summary settings
14
- qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2") # Better QA model for concise answers
15
 
16
  # Sentence Transformer for embedding-based retrieval
17
- embedder = SentenceTransformer('all-MiniLM-L6-v2') # A compact and efficient embedding model
18
 
19
  # FAISS Indexing Function with better embedding-based chunking
20
  def create_faiss_index(text):
21
- paragraphs = text.split('\n\n') # Assuming paragraphs are separated by double newlines
22
  embeddings = embedder.encode(paragraphs) # Use sentence-transformers to create embeddings
23
- index = faiss.IndexFlatL2(384) # We use 384-dimensional embeddings (from the chosen model)
24
- index.add(np.array(embeddings).astype(np.float32)) # Add the embeddings to the FAISS index
25
  return index, paragraphs
26
 
27
  # Function to retrieve the most relevant chunk from the index
28
  def retrieve_relevant_chunk(query, index, paragraphs):
29
- query_embedding = embedder.encode([query]) # Use sentence transformer to embed the query
30
- D, I = index.search(np.array(query_embedding).astype(np.float32), 1) # Search for the closest chunk
31
- return paragraphs[I[0][0]] # Return the most relevant paragraph
32
 
33
  # Streamlit UI
34
  st.title("Concise Summarizer and Q&A")
@@ -37,7 +37,7 @@ st.title("Concise Summarizer and Q&A")
37
  uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
38
 
39
  if uploaded_file:
40
- file_type = uploaded_file.type # Get the MIME type of the uploaded file
41
 
42
  # Extract text based on file type
43
  if file_type == "application/pdf":
@@ -51,7 +51,7 @@ if uploaded_file:
51
  for para in doc.paragraphs:
52
  text += para.text + "\n"
53
  elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
54
- df = pd.read_excel(uploaded_file, engine="openpyxl") # Use openpyxl to read .xlsx files
55
  text = ""
56
  for col in df.columns:
57
  text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
@@ -60,11 +60,11 @@ if uploaded_file:
60
  text = ""
61
 
62
  if text:
63
- # Show extracted text (first 500 characters)
64
  st.write("Text extracted from file:")
65
- st.write(text[:500]) # Show first 500 characters of extracted text
66
 
67
- # Create FAISS index from the extracted text
68
  index, paragraphs = create_faiss_index(text)
69
 
70
  # Input for user query
@@ -77,19 +77,20 @@ if uploaded_file:
77
 
78
  # Answer the question based on the relevant chunk
79
  st.write("Answering the question...")
80
- answer = qa_pipeline(question=query, context=relevant_chunk) # Use question-answering pipeline
81
- st.write(f"Answer: {answer['answer']}")
 
82
 
83
- # Summarize the relevant chunk (display summary after answering the query)
84
  if relevant_chunk.strip():
85
- if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
86
  try:
87
  st.write("Summarizing...")
88
- summary = summarizer(relevant_chunk, max_length=80, min_length=50, do_sample=False)[0]['summary_text']
89
  st.write(f"Summary: {summary}")
90
  except Exception as e:
91
  st.write(f"Error summarizing text: {str(e)}")
92
  else:
93
- st.write("Text is too short to summarize effectively.")
94
  else:
95
  st.write("No relevant text found to summarize.")
 
10
  import pandas as pd
11
 
12
  # Initialize the summarization and question-answering models from Hugging Face
13
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn", max_length=50, min_length=30, do_sample=False) # Concise summary settings
14
+ qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2") # More focused QA model
15
 
16
  # Sentence Transformer for embedding-based retrieval
17
+ embedder = SentenceTransformer('all-MiniLM-L6-v2') # Efficient and compact embeddings
18
 
19
  # FAISS Indexing Function with better embedding-based chunking
20
  def create_faiss_index(text):
21
+ paragraphs = text.split('\n\n') # Split into paragraphs by double newlines
22
  embeddings = embedder.encode(paragraphs) # Use sentence-transformers to create embeddings
23
+ index = faiss.IndexFlatL2(384) # Using 384-dimensional embeddings
24
+ index.add(np.array(embeddings).astype(np.float32)) # Add embeddings to the FAISS index
25
  return index, paragraphs
26
 
27
  # Function to retrieve the most relevant chunk from the index
28
  def retrieve_relevant_chunk(query, index, paragraphs):
29
+ query_embedding = embedder.encode([query]) # Get embedding for the query
30
+ D, I = index.search(np.array(query_embedding).astype(np.float32), 1) # Search for closest match
31
+ return paragraphs[I[0][0]] # Return the relevant paragraph
32
 
33
  # Streamlit UI
34
  st.title("Concise Summarizer and Q&A")
 
37
  uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
38
 
39
  if uploaded_file:
40
+ file_type = uploaded_file.type # Get MIME type
41
 
42
  # Extract text based on file type
43
  if file_type == "application/pdf":
 
51
  for para in doc.paragraphs:
52
  text += para.text + "\n"
53
  elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
54
+ df = pd.read_excel(uploaded_file, engine="openpyxl")
55
  text = ""
56
  for col in df.columns:
57
  text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
 
60
  text = ""
61
 
62
  if text:
63
+ # Display the first 500 characters of extracted text
64
  st.write("Text extracted from file:")
65
+ st.write(text[:500]) # Show first 500 characters
66
 
67
+ # Create FAISS index
68
  index, paragraphs = create_faiss_index(text)
69
 
70
  # Input for user query
 
77
 
78
  # Answer the question based on the relevant chunk
79
  st.write("Answering the question...")
80
+ answer = qa_pipeline(question=query, context=relevant_chunk)
81
+ concise_answer = answer['answer'][:300] # Limiting answer to 300 characters (short answer)
82
+ st.write(f"Answer: {concise_answer}")
83
 
84
+ # Summarize the relevant chunk (concise summary after query answer)
85
  if relevant_chunk.strip():
86
+ if len(relevant_chunk.split()) > 20: # Only summarize if text is long enough
87
  try:
88
  st.write("Summarizing...")
89
+ summary = summarizer(relevant_chunk, max_length=50, min_length=30, do_sample=False)[0]['summary_text']
90
  st.write(f"Summary: {summary}")
91
  except Exception as e:
92
  st.write(f"Error summarizing text: {str(e)}")
93
  else:
94
+ st.write("Text is too short to summarize.")
95
  else:
96
  st.write("No relevant text found to summarize.")