gmustafa413 committed on
Commit
999a7b7
·
verified ·
1 Parent(s): 83833a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +240 -1
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import gradio as gr
2
  import fitz
3
  import numpy as np
@@ -11,6 +14,242 @@ from pptx import Presentation
11
  from sentence_transformers import SentenceTransformer
12
  from concurrent.futures import ThreadPoolExecutor
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Configuration
15
  GROQ_API_KEY = "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"
16
  MODEL_NAME = "all-MiniLM-L6-v2"
@@ -187,4 +426,4 @@ with gr.Blocks(title="RAG System", css=".footer {display: none !important}") as
187
  clear_btn.click(lambda: [], None, chatbot)
188
 
189
  app.launch()
190
-
 
1
# Install dependencies (run this manually, or in a notebook cell — the "!"
# shell magic is IPython-only and is a SyntaxError in a plain Python script):
#   pip install -q gradio pymupdf python-docx pandas python-pptx sentence-transformers faiss-cpu requests
3
+
4
  import gradio as gr
5
  import fitz
6
  import numpy as np
 
14
  from sentence_transformers import SentenceTransformer
15
  from concurrent.futures import ThreadPoolExecutor
16
 
17
import os  # used to read the API key from the environment

# Configuration
# SECURITY FIX: the Groq API key was hard-coded in source (and therefore
# leaked with every copy of this file). Read it from the environment instead;
# export GROQ_API_KEY before launching the app. The leaked key should be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
MODEL_NAME = "all-MiniLM-L6-v2"   # sentence-transformers embedding model
CHUNK_SIZE = 512                  # words per chunk window (halved at use site)
MAX_TOKENS = 4096                 # completion budget for the Groq call
MODEL = SentenceTransformer(MODEL_NAME)
WORKERS = 8                       # thread-pool width for file processing
24
+
25
class DocumentProcessor:
    """Extracts text from uploaded documents, chunks it, embeds the chunks
    into a FAISS inner-product index, and answers questions via the Groq
    chat-completions API.

    NOTE(review): indentation was reconstructed from a diff view — nesting is
    the obvious reading of the code, but confirm against the original file.
    """

    def __init__(self):
        # L2-normalised vectors make IndexFlatIP scores equal cosine
        # similarity (MiniLM embeddings are not unit-length by default).
        self.index = faiss.IndexFlatIP(MODEL.get_sentence_embedding_dimension())
        self.chunks = []
        self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)

    def extract_text_from_pptx(self, file_path):
        """Return all shape text from a .pptx, or "" on any failure."""
        try:
            prs = Presentation(file_path)
            return " ".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")])
        except Exception as e:
            print(f"PPTX Error: {str(e)}")
            return ""

    def extract_text_from_xls_csv(self, file_path):
        """Return every cell of a spreadsheet/CSV as one space-joined string."""
        try:
            if file_path.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file_path)
            else:
                df = pd.read_csv(file_path)
            return " ".join(df.astype(str).values.flatten())
        except Exception as e:
            print(f"Spreadsheet Error: {str(e)}")
            return ""

    def extract_text_from_pdf(self, file_path):
        """Return the concatenated text of all PDF pages, or "" on failure."""
        try:
            # FIX: context manager closes the document handle — the original
            # leaked the open fitz document on every call.
            with fitz.open(file_path) as doc:
                return " ".join(page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE) for page in doc)
        except Exception as e:
            print(f"PDF Error: {str(e)}")
            return ""

    def process_file(self, file):
        """Dispatch *file* (a Gradio file object with .name) to the matching
        extractor and return whitespace-normalised text ("" if unsupported)."""
        try:
            file_path = file.name
            print(f"Processing: {file_path}")  # Debug print

            if file_path.endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif file_path.endswith('.docx'):
                text = " ".join(p.text for p in Document(file_path).paragraphs)
            elif file_path.endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            elif file_path.endswith('.pptx'):
                text = self.extract_text_from_pptx(file_path)
            elif file_path.endswith(('.xls', '.xlsx', '.csv')):
                text = self.extract_text_from_xls_csv(file_path)
            else:
                return ""

            clean_text = re.sub(r'\s+', ' ', text).strip()
            print(f"Extracted {len(clean_text)} characters from {file_path}")  # Debug
            return clean_text
        except Exception as e:
            print(f"Processing Error: {str(e)}")  # Debug
            return ""

    def semantic_chunking(self, text):
        """Split *text* into overlapping-free word windows of CHUNK_SIZE//2
        words each, capped at 1000 chunks."""
        words = re.findall(r'\S+\s*', text)
        chunks = [''.join(words[i:i+CHUNK_SIZE//2]) for i in range(0, len(words), CHUNK_SIZE//2)]
        return chunks[:1000]  # Limit to 1000 chunks per document

    def process_documents(self, files):
        """Extract, chunk, embed and index *files*; return a status string."""
        self.chunks = []
        if not files:
            return "No files uploaded!"

        print("\n" + "="*40 + " PROCESSING DOCUMENTS " + "="*40)
        texts = list(self.processor_pool.map(self.process_file, files))

        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
            chunk_lists = list(executor.map(self.semantic_chunking, texts))

        all_chunks = [chunk for chunk_list in chunk_lists for chunk in chunk_list]
        print(f"Total chunks generated: {len(all_chunks)}")  # Debug

        if not all_chunks:
            return "Error: No chunks generated from documents"

        try:
            embeddings = MODEL.encode(
                all_chunks,
                batch_size=512,
                convert_to_tensor=True,
                show_progress_bar=False
            ).cpu().numpy().astype('float32')

            # FIX: normalise so IndexFlatIP returns cosine similarity rather
            # than norm-biased raw dot products.
            faiss.normalize_L2(embeddings)
            self.index.reset()
            self.index.add(embeddings)
            self.chunks = all_chunks
            return f"✅ Processed {len(all_chunks)} chunks from {len(files)} files"
        except Exception as e:
            print(f"Embedding Error: {str(e)}")
            return f"Error: {str(e)}"

    def query(self, question):
        """Answer *question* from the top-3 indexed chunks via the Groq API.

        Returns a (answer_text, success_flag) tuple.
        """
        if not self.chunks:
            return "Please process documents first", False

        try:
            print("\n" + "="*40 + " QUERY PROCESSING " + "="*40)
            print(f"Question: {question}")

            # Generate embedding for the question (normalised to match index)
            question_embedding = MODEL.encode([question], convert_to_tensor=True).cpu().numpy().astype('float32')
            faiss.normalize_L2(question_embedding)

            # Search FAISS index
            _, indices = self.index.search(question_embedding, 3)
            print(f"Top indices: {indices}")

            # FIX: FAISS pads missing hits with -1, which the original test
            # (i < len) let through as a negative Python index.
            context = "\n".join([self.chunks[i] for i in indices[0] if 0 <= i < len(self.chunks)])
            print(f"Context length: {len(context)} characters")

            # API Call with error handling
            headers = {
                "Authorization": f"Bearer {GROQ_API_KEY}",
                "Content-Type": "application/json"
            }

            payload = {
                "messages": [{
                    "role": "user",
                    "content": f"Answer concisely: {question}\nContext: {context}"
                }],
                "model": "mixtral-8x7b-32768",
                "temperature": 0.3,
                "max_tokens": MAX_TOKENS,
                "stream": True
            }

            # FIX: the payload asks for a streamed reply but the original call
            # omitted stream=True, so requests buffered the entire body before
            # iter_lines() ever ran.
            response = requests.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=20,
                stream=True
            )

            print(f"API Status Code: {response.status_code}")  # Debug

            if response.status_code != 200:
                return f"API Error: {response.text}", False

            full_answer = []
            for chunk in response.iter_lines():
                if chunk:
                    try:
                        decoded = chunk.decode('utf-8').strip()
                        if decoded.startswith('data:'):
                            body = decoded[5:].strip()
                            if body == '[DONE]':  # SSE end-of-stream sentinel
                                break
                            data = json.loads(body)
                            if content := data.get('choices', [{}])[0].get('delta', {}).get('content', ''):
                                full_answer.append(content)
                    except Exception as e:
                        print(f"Chunk Error: {str(e)}")
                        continue

            final_answer = ''.join(full_answer)
            print(f"Final Answer: {final_answer}")  # Debug
            return final_answer, True

        except Exception as e:
            print(f"Query Error: {str(e)}")  # Debug
            return f"Error: {str(e)}", False
190
+
191
# Initialize processor
processor = DocumentProcessor()

# Gradio interface with improved error handling
def ask_question(question, chat_history):
    """Validate *question*, query the processor, and return the chat history
    with the new (question, answer) pair appended."""
    if not question.strip():
        return chat_history + [("", "Please enter a valid question")]

    # FIX: the success flag was bound to an unused local; discard explicitly.
    answer, _ = processor.query(question)
    return chat_history + [(question, answer)]
201
+
202
# Gradio UI wiring. NOTE(review): nesting reconstructed from a diff view —
# confirm layout against the original file.
with gr.Blocks(title="RAG System") as app:
    gr.Markdown("## 🚀 Multi-Format RAG System")
    with gr.Row():
        files = gr.File(file_count="multiple",
                        file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
                        label="Upload Documents")
        process_btn = gr.Button("Process", variant="primary")
    status = gr.Textbox(label="Processing Status", interactive=False)
    chatbot = gr.Chatbot(height=500, label="Chat History")
    with gr.Row():
        question = gr.Textbox(label="Your Query",
                              placeholder="Enter your question...",
                              max_lines=3)
        ask_btn = gr.Button("Ask", variant="primary")
        clear_btn = gr.Button("Clear Chat")

    process_btn.click(
        fn=processor.process_documents,
        inputs=files,
        outputs=status
    )

    ask_btn.click(
        fn=ask_question,
        inputs=[question, chatbot],
        outputs=chatbot
    ).then(lambda: "", None, question)  # Clear input after submission

    clear_btn.click(
        fn=lambda: [],
        inputs=None,
        outputs=chatbot
    )

# FIX: guard the launch so importing this module doesn't start a server.
if __name__ == "__main__":
    app.launch(share=True, debug=True)
237
+
238
# NOTE(review): removed a large commented-out triple-quoted duplicate of an
# earlier version of this file (dead code with no runtime effect). Version
# history belongs in git, not in the module body.