AzizWazir committed on
Commit
2558e6a
·
verified ·
1 Parent(s): 3645bc3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -44
app.py CHANGED
@@ -5,6 +5,9 @@ import numpy as np
5
  from transformers import pipeline
6
  from sentence_transformers import SentenceTransformer
7
  import requests
 
 
 
8
 
9
  # Initialize the summarization and question-answering models from Hugging Face
10
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
@@ -26,6 +29,22 @@ def extract_text_from_pdf(pdf_file):
26
  text += page.get_text()
27
  return text
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # FAISS Indexing Function with better embedding-based chunking
30
  def create_faiss_index(text):
31
  # Split the text into paragraphs or logical sections for better context
@@ -64,48 +83,57 @@ def call_groq_api(input_text):
64
  return f"Error: {str(e)}"
65
 
66
  # Streamlit UI
67
- st.title("RAG-based PDF Summarizer and Q&A")
68
-
69
- # Upload PDF file
70
- pdf_file = st.file_uploader("Upload a PDF file", type="pdf")
71
-
72
- if pdf_file:
73
- # Extract text from the uploaded PDF
74
- text = extract_text_from_pdf(pdf_file)
75
- st.write("Text extracted from PDF:")
76
- st.write(text[:500]) # Show first 500 characters of extracted text
77
-
78
- # Create FAISS index from the extracted text
79
- index, paragraphs = create_faiss_index(text)
80
-
81
- # Input for user query
82
- query = st.text_input("Enter your query:")
83
-
84
- if query:
85
- st.write("Retrieving relevant information...")
86
- relevant_chunk = retrieve_relevant_chunk(query, index, paragraphs)
87
- st.write(f"Relevant Text: {relevant_chunk}")
88
-
89
- # Answer the question based on the relevant chunk
90
- st.write("Answering the question...")
91
- answer = qa_pipeline(question=query, context=relevant_chunk) # Use question-answering pipeline
92
- st.write(f"Answer: {answer['answer']}")
93
-
94
- # Summarize the relevant chunk, but check if it's empty or too short
95
- if relevant_chunk.strip():
96
- # Ensure it's long enough for summarization (avoid too short text)
97
- if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
98
- try:
99
- st.write("Summarizing...")
100
- summary = summarizer(relevant_chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
101
- st.write(f"Summary: {summary}")
102
- except Exception as e:
103
- st.write(f"Error summarizing text: {str(e)}")
104
- else:
105
- st.write("Text is too short to summarize effectively.")
106
- else:
107
- st.write("No relevant text found to summarize.")
108
 
109
- # Optionally, summarize using Groq API
110
- # summary = call_groq_api(relevant_chunk)
111
- # st.write(f"Groq Summary: {summary}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from transformers import pipeline
6
  from sentence_transformers import SentenceTransformer
7
  import requests
8
+ from io import BytesIO
9
+ import docx
10
+ import pandas as pd
11
 
12
  # Initialize the summarization and question-answering models from Hugging Face
13
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
29
  text += page.get_text()
30
  return text
31
 
32
+ # MS Word Processing Function
33
+ def extract_text_from_word(word_file):
34
+ doc = docx.Document(BytesIO(word_file.read()))
35
+ text = ""
36
+ for para in doc.paragraphs:
37
+ text += para.text + "\n"
38
+ return text
39
+
40
+ # Excel File Processing Function
41
+ def extract_text_from_excel(excel_file):
42
+ df = pd.read_excel(excel_file, engine="openpyxl") # Use openpyxl to read .xlsx files
43
+ text = ""
44
+ for col in df.columns:
45
+ text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n" # Join values in each column
46
+ return text
47
+
48
  # FAISS Indexing Function with better embedding-based chunking
49
  def create_faiss_index(text):
50
  # Split the text into paragraphs or logical sections for better context
 
83
  return f"Error: {str(e)}"
84
 
85
  # Streamlit UI
86
+ st.title("RAG-based PDF, Word, Excel Summarizer and Q&A")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ # Upload File
89
+ uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
90
+
91
+ if uploaded_file:
92
+ file_type = uploaded_file.type # Get the MIME type of the uploaded file
93
+
94
+ # Extract text based on file type
95
+ if file_type == "application/pdf":
96
+ text = extract_text_from_pdf(uploaded_file)
97
+ elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
98
+ text = extract_text_from_word(uploaded_file)
99
+ elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
100
+ text = extract_text_from_excel(uploaded_file)
101
+ else:
102
+ st.error("Unsupported file type!")
103
+ text = ""
104
+
105
+ if text:
106
+ # Show extracted text (first 500 characters)
107
+ st.write("Text extracted from file:")
108
+ st.write(text[:500]) # Show first 500 characters of extracted text
109
+
110
+ # Create FAISS index from the extracted text
111
+ index, paragraphs = create_faiss_index(text)
112
+
113
+ # Input for user query
114
+ query = st.text_input("Enter your query:")
115
+
116
+ if query:
117
+ st.write("Retrieving relevant information...")
118
+ relevant_chunk = retrieve_relevant_chunk(query, index, paragraphs)
119
+ st.write(f"Relevant Text: {relevant_chunk}")
120
+
121
+ # Answer the question based on the relevant chunk
122
+ st.write("Answering the question...")
123
+ answer = qa_pipeline(question=query, context=relevant_chunk) # Use question-answering pipeline
124
+ st.write(f"Answer: {answer['answer']}")
125
+
126
+ # Summarize the relevant chunk
127
+ if relevant_chunk.strip():
128
+ # Ensure it's long enough for summarization (avoid too short text)
129
+ if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
130
+ try:
131
+ st.write("Summarizing...")
132
+ summary = summarizer(relevant_chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
133
+ st.write(f"Summary: {summary}")
134
+ except Exception as e:
135
+ st.write(f"Error summarizing text: {str(e)}")
136
+ else:
137
+ st.write("Text is too short to summarize effectively.")
138
+ else:
139
+ st.write("No relevant text found to summarize.")