MohammadYaseen committed on
Commit
4184e11
·
verified ·
1 Parent(s): dc988b3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +205 -0
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import PyPDF2
4
+ import docx
5
+ from sentence_transformers import SentenceTransformer
6
+ import faiss
7
+ import streamlit as st
8
+ import time
9
+ from groq import Groq
10
+ import re
11
+
12
# Embedding model and vector index.
#
# Streamlit re-executes this script from top to bottom on every user
# interaction, so the model is loaded through a cached factory: the (slow)
# SentenceTransformer construction then happens once per server process
# instead of once per rerun.
@st.cache_resource
def _load_embedding_model():
    """Build the sentence-embedding model used for both chunks and queries."""
    return SentenceTransformer('all-MiniLM-L6-v2')


embedding_model = _load_embedding_model()

# FAISS setup
dimension = 384  # Dimension of 'all-MiniLM-L6-v2' embeddings
index = faiss.IndexFlatL2(dimension)  # exact L2 index, rebuilt on each rerun
document_texts = []  # document_texts[i] is the text whose embedding is row i of `index`
19
+
20
# Upload limits referenced by the file-upload section of the UI.
MAX_NUM_FILES = 5  # files accepted per upload batch
MAX_FILE_SIZE_MB = 100  # per-file size threshold, in megabytes
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 ** 2  # same threshold, in bytes
24
+
25
# Set up the Groq API client.
#
# The key is read from the GROQ_API_KEY environment variable (e.g. a hosting
# secret) so that no credential lives in source control. Any key that was ever
# committed to this file must be treated as leaked and revoked.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    # Fail early with a readable message instead of an authentication error
    # on the first question.
    st.error("GROQ_API_KEY is not set. Configure it as an environment variable or secret and reload the app.")
    st.stop()
client = Groq(api_key=api_key)
28
+
29
def get_human_readable_size(size_in_bytes):
    """Format a byte count as a short label.

    Values below 1024 are shown as whole "Bytes"; larger values are scaled by
    powers of 1024 and shown with two decimals as KB, MB, or GB (GB is the
    largest unit used).
    """
    kib = 1024
    if size_in_bytes < kib:
        return f"{size_in_bytes} Bytes"
    # Pick the first unit whose upper bound the value stays under.
    for exponent, unit in ((1, "KB"), (2, "MB")):
        if size_in_bytes < kib ** (exponent + 1):
            return f"{size_in_bytes / kib ** exponent:.2f} {unit}"
    return f"{size_in_bytes / kib ** 3:.2f} GB"
39
+
40
def _rows_to_text(df):
    """Render a DataFrame as one line per row, cells separated by spaces."""
    return "\n".join(" ".join(map(str, row)) for row in df.values)


def extract_text_from_file(file):
    """Extract plain text from an uploaded document.

    The format is chosen from the file name's extension, compared
    case-insensitively so that e.g. "REPORT.PDF" is handled like "report.pdf".

    Args:
        file: A binary file-like object with a ``name`` attribute (such as a
            Streamlit upload).

    Returns:
        The extracted text, or ``None`` when the extension is not one of
        .pdf, .csv, .xlsx, .xls, .txt, .docx.
    """
    name = file.name.lower()
    if name.endswith(".pdf"):
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ""` tolerates a page for which no text is returned; pages are
        # joined with a newline so words at page boundaries do not fuse.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
    if name.endswith(".csv"):
        return _rows_to_text(pd.read_csv(file))
    if name.endswith((".xlsx", ".xls")):
        return _rows_to_text(pd.read_excel(file))
    if name.endswith(".txt"):
        # Replace undecodable bytes rather than failing the whole upload.
        return file.read().decode("utf-8", errors="replace")
    if name.endswith(".docx"):
        doc = docx.Document(file)
        return "\n".join(p.text for p in doc.paragraphs)
    return None
61
+
62
def split_text_into_chunks(text, max_chunk_size=500):
    """Group the sentences of ``text`` into chunks of bounded size.

    Sentences are obtained by splitting on ". " and are packed greedily: a
    chunk is closed as soon as adding the next sentence would push the summed
    sentence lengths above ``max_chunk_size`` (the ". " separators are not
    counted). A single sentence longer than the limit becomes a chunk of its
    own; it is never split further.

    Args:
        text: The text to split.
        max_chunk_size: Upper bound on the summed sentence lengths per chunk.

    Returns:
        A list of chunk strings, each made of sentences re-joined with ". ".
    """
    chunks = []
    current = []
    current_size = 0
    for sentence in text.split(". "):
        sentence_size = len(sentence)
        # Flush only when something has been collected; otherwise an
        # over-long first sentence would emit a spurious empty chunk.
        if current and current_size + sentence_size > max_chunk_size:
            chunks.append(". ".join(current))
            current = []
            current_size = 0
        current.append(sentence)
        current_size += sentence_size
    if current:
        chunks.append(". ".join(current))
    return chunks
80
+
81
def add_to_index(text, index, document_texts):
    """Chunk ``text``, embed the chunks, and append them to the index.

    Args:
        text: Document text to index.
        index: FAISS index that receives one embedding per chunk.
        document_texts: List extended with the chunk strings, in the same
            order as the embeddings added to ``index``.
    """
    # Blank chunks carry no content; embedding them would only add noise
    # vectors to the index.
    chunks = [chunk for chunk in split_text_into_chunks(text) if chunk.strip()]
    if not chunks:
        return  # nothing to embed; avoid handing an empty batch to the index
    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
    index.add(embeddings)
    document_texts.extend(chunks)
87
+
88
def suggest_questions(text):
    """Return three canned starter questions for a document.

    The set is chosen purely by length: texts with fewer than 200
    whitespace-separated words get the "short document" questions, all others
    get the "long document" questions. The content itself is not inspected.
    """
    word_count = len(text.split())
    short_document_questions = [
        "Can you summarize the main points?",
        "What is the main argument or conclusion?",
        "What is the purpose of this document?",
    ]
    long_document_questions = [
        "What are the key takeaways from this document?",
        "Can you provide a summary of the main sections?",
        "What are the major findings or conclusions?",
    ]
    return short_document_questions if word_count < 200 else long_document_questions
103
+
104
def generate_answer_with_groq(question, context):
    """Ask the Groq chat model ``question`` with ``context`` prepended.

    Context and question are sent together as a single user message; the
    content of the first returned choice is the answer.
    """
    prompt = f"Context: {context}\nQuestion: {question}"
    completion = client.chat.completions.create(
        model="gemma2-9b-it",
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
112
+
113
def is_valid_input(query):
    """Return True when ``query`` looks like a plain-text question.

    After stripping surrounding whitespace the query must be non-empty and
    consist only of ASCII letters, digits, whitespace, and the punctuation
    characters . , ! ? ' and -. Anything else is rejected.
    """
    allowed = r"^[A-Za-z0-9\s.,!?'-]*$"
    stripped = query.strip()
    if stripped == "":
        return False  # empty or whitespace-only input
    return re.match(allowed, stripped) is not None
125
+
126
def handle_feedback(feedback):
    """Show a thank-you message when the user has supplied feedback.

    The feedback text itself is not stored or forwarded anywhere.
    """
    if not feedback:
        return
    st.write("Thank you for your feedback!")
130
+
131
# Streamlit UI: page title, sidebar tips, feedback box, and file uploader.
# The limits shown to the user are interpolated from MAX_FILE_SIZE_MB and
# MAX_NUM_FILES so the text cannot drift from the values actually checked.
st.title("Enhanced Document Q&A with RAG")
st.sidebar.title("Tips for Better Experience")
st.sidebar.write(f"""
1. Maximum file size: {MAX_FILE_SIZE_MB} MB per file.
2. You can upload up to {MAX_NUM_FILES} files at a time.
3. Larger files may take longer to process.
4. Please break large files into smaller chunks if necessary.
5. Use the pre-generated questions to guide your inquiry.
""")

feedback = st.sidebar.text_area("Provide feedback to improve your experience:")

# File uploader
uploaded_files = st.file_uploader(
    f"Upload documents (PDF, CSV, Excel, TXT, DOCX). Max size: {MAX_FILE_SIZE_MB} MB each.",
    type=["pdf", "csv", "xlsx", "xls", "txt", "docx"],
    accept_multiple_files=True,
)
150
+
151
# Extract and index every uploaded file (at most MAX_NUM_FILES per batch).
if uploaded_files:
    if len(uploaded_files) > MAX_NUM_FILES:
        st.error(f"Maximum {MAX_NUM_FILES} files can be uploaded at a time.")
    else:
        for file in uploaded_files:
            file_size = file.size
            human_readable_size = get_human_readable_size(file_size)
            st.write(f"File: {file.name} | Size: {human_readable_size}")
            if file_size > MAX_FILE_SIZE_BYTES:
                # Oversized files are still processed; the user is only warned.
                st.warning(
                    f"File '{file.name}' exceeds the {MAX_FILE_SIZE_MB} MB limit. "
                    "We will automatically break this file into smaller chunks."
                )
            with st.spinner(f"Processing {file.name}..."):
                try:
                    text = extract_text_from_file(file)
                except Exception as exc:
                    # UI boundary: the parsers raise library-specific errors
                    # for corrupt files; report and move on to the next file
                    # instead of crashing the whole app.
                    st.error(f"Could not read {file.name}: {exc}")
                    continue
                if text is None:
                    st.error(f"Could not process {file.name}. Unsupported format.")
                elif not text.strip():
                    st.warning(f"No extractable text found in {file.name}.")
                else:
                    # add_to_index does the chunking itself. Pre-chunking and
                    # re-joining with " " dropped the ". " sentence boundaries
                    # between chunks and did the split twice.
                    add_to_index(text, index, document_texts)
                    st.success(f"Processed {file.name}")
else:
    st.warning("No documents uploaded yet. Please upload documents before asking questions.")
175
+
176
# Acknowledge sidebar feedback. handle_feedback() is a no-op for empty input,
# so it can be called unconditionally.
handle_feedback(feedback)
179
+
180
# Question input and answer generation.
query = st.text_input("Enter your question:")


def _retrieve_context(question, k=5):
    """Return the text of the ``k`` indexed chunks nearest to ``question``.

    The question is embedded with the same model as the documents and looked
    up in the FAISS index; the matching chunks are joined with newlines.
    Returns an empty string when the index holds no vectors.
    """
    k = min(k, index.ntotal)
    if k <= 0:
        return ""
    query_embedding = embedding_model.encode([question], convert_to_numpy=True)
    _, neighbor_ids = index.search(query_embedding, k)
    # FAISS pads missing results with -1; skip anything out of range.
    return "\n".join(
        document_texts[i] for i in neighbor_ids[0] if 0 <= i < len(document_texts)
    )


if query:
    if not document_texts:
        st.warning("Please upload and process documents before asking questions.")
    elif not is_valid_input(query):
        st.error("Please ask a relevant question.")
    else:
        with st.spinner("Generating response..."):
            # Send only the most relevant chunks rather than every document in
            # full: this is what the index is built for, and it keeps the
            # prompt within a bounded size.
            context = _retrieve_context(query)
            response = generate_answer_with_groq(query, context)
        st.write("### Answer:")
        st.write(response)

        # NOTE(review): the original indentation of this section was lost;
        # it is assumed to belong to the answered-question branch - confirm.
        st.write("### Suggested Questions:")
        questions = suggest_questions(" ".join(document_texts))  # chosen by total corpus length
        for question in questions:
            st.write(f"- {question}")
200
+
201
# Closing hint: remind the user to upload documents, or to ask a question
# once documents are present.
closing_hint = (
    "You haven't uploaded any documents yet. Please upload documents to start."
    if not uploaded_files
    else "Enter a question to ask about the uploaded documents."
)
st.info(closing_hint)