gmustafa413 committed on
Commit
f544dea
·
verified ·
1 Parent(s): 298b385

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -193
app.py CHANGED
@@ -200,7 +200,7 @@ def ask_question(question, chat_history):
200
  return chat_history + [(question, answer)]
201
 
202
  with gr.Blocks(title="RAG System") as app:
203
- gr.Markdown("## 🚀 Multi-Format RAG System")
204
  with gr.Row():
205
  files = gr.File(file_count="multiple",
206
  file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
@@ -235,195 +235,3 @@ with gr.Blocks(title="RAG System") as app:
235
 
236
  app.launch(share=True, debug=True)
237
 
238
- #3000000000000000000000000000000000000000000000000000000000
239
-
240
- '''import gradio as gr
241
- import fitz
242
- import numpy as np
243
- import requests
244
- import faiss
245
- import re
246
- import json
247
- import pandas as pd
248
- from docx import Document
249
- from pptx import Presentation
250
- from sentence_transformers import SentenceTransformer
251
- from concurrent.futures import ThreadPoolExecutor
252
-
253
- # Configuration
254
- GROQ_API_KEY = "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"
255
- MODEL_NAME = "all-MiniLM-L6-v2"
256
- CHUNK_SIZE = 512
257
- MAX_TOKENS = 4096
258
- MODEL = SentenceTransformer(MODEL_NAME)
259
- WORKERS = 8
260
-
261
- class DocumentProcessor:
262
- def __init__(self):
263
- self.index = faiss.IndexFlatIP(MODEL.get_sentence_embedding_dimension())
264
- self.chunks = []
265
- self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)
266
-
267
- def extract_text_from_pptx(self, file_path):
268
- try:
269
- prs = Presentation(file_path)
270
- return " ".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")])
271
- except Exception as e:
272
- print(f"PPTX Error: {str(e)}")
273
- return ""
274
-
275
- def extract_text_from_xls_csv(self, file_path):
276
- try:
277
- if file_path.endswith(('.xls', '.xlsx')):
278
- df = pd.read_excel(file_path)
279
- else:
280
- df = pd.read_csv(file_path)
281
- return " ".join(df.astype(str).values.flatten())
282
- except Exception as e:
283
- print(f"Spreadsheet Error: {str(e)}")
284
- return ""
285
-
286
- def extract_text_from_pdf(self, file_path):
287
- try:
288
- doc = fitz.open(file_path)
289
- return " ".join(page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE) for page in doc)
290
- except Exception as e:
291
- print(f"PDF Error: {str(e)}")
292
- return ""
293
-
294
- def process_file(self, file):
295
- try:
296
- file_path = file.name
297
- if file_path.endswith('.pdf'):
298
- text = self.extract_text_from_pdf(file_path)
299
- elif file_path.endswith('.docx'):
300
- text = " ".join(p.text for p in Document(file_path).paragraphs)
301
- elif file_path.endswith('.txt'):
302
- with open(file_path, 'r', encoding='utf-8') as f:
303
- text = f.read()
304
- elif file_path.endswith('.pptx'):
305
- text = self.extract_text_from_pptx(file_path)
306
- elif file_path.endswith(('.xls', '.xlsx', '.csv')):
307
- text = self.extract_text_from_xls_csv(file_path)
308
- else:
309
- return ""
310
- return re.sub(r'\s+', ' ', text).strip()
311
- except Exception as e:
312
- print(f"Processing Error: {str(e)}")
313
- return ""
314
-
315
- def semantic_chunking(self, text):
316
- words = re.findall(r'\S+\s*', text)
317
- chunks = [''.join(words[i:i+CHUNK_SIZE//2]) for i in range(0, len(words), CHUNK_SIZE//2)]
318
- return chunks[:1000]
319
-
320
- def process_documents(self, files):
321
- self.chunks = []
322
- if not files:
323
- return "No files uploaded!"
324
-
325
- texts = list(self.processor_pool.map(self.process_file, files))
326
- with ThreadPoolExecutor(max_workers=WORKERS) as executor:
327
- chunk_lists = list(executor.map(self.semantic_chunking, texts))
328
-
329
- all_chunks = [chunk for chunk_list in chunk_lists for chunk in chunk_list]
330
- if not all_chunks:
331
- return "Error: No chunks generated from documents"
332
-
333
- try:
334
- embeddings = MODEL.encode(
335
- all_chunks,
336
- batch_size=512,
337
- convert_to_tensor=True,
338
- show_progress_bar=False
339
- ).cpu().numpy().astype('float32')
340
-
341
- self.index.reset()
342
- self.index.add(embeddings)
343
- self.chunks = all_chunks
344
- return f"Successfully Processed {len(all_chunks)} chunks from {len(files)} files"
345
- except Exception as e:
346
- print(f"Embedding Error: {str(e)}")
347
- return f"Error: {str(e)}"
348
-
349
- def query(self, question):
350
- if not self.chunks:
351
- return "Please process documents first", False
352
-
353
- try:
354
- question_embedding = MODEL.encode([question], convert_to_tensor=True).cpu().numpy().astype('float32')
355
- _, indices = self.index.search(question_embedding, 3)
356
- context = "\n".join([self.chunks[i] for i in indices[0] if i < len(self.chunks)])
357
-
358
- response = requests.post(
359
- "https://api.groq.com/openai/v1/chat/completions",
360
- headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
361
- json={
362
- "messages": [{
363
- "role": "user",
364
- "content": f"Answer concisely: {question}\nContext: {context}"
365
- }],
366
- "model": "mixtral-8x7b-32768",
367
- "temperature": 0.3,
368
- "max_tokens": MAX_TOKENS,
369
- "stream": True
370
- },
371
- timeout=20
372
- )
373
-
374
- if response.status_code != 200:
375
- return f"API Error: {response.text}", False
376
-
377
- full_answer = []
378
- for chunk in response.iter_lines():
379
- if chunk:
380
- try:
381
- decoded = chunk.decode('utf-8').strip()
382
- if decoded.startswith('data:'):
383
- data = json.loads(decoded[5:])
384
- if content := data.get('choices', [{}])[0].get('delta', {}).get('content', ''):
385
- full_answer.append(content)
386
- except:
387
- continue
388
-
389
- return ''.join(full_answer), True
390
- except Exception as e:
391
- print(f"Query Error: {str(e)}")
392
- return f"Error: {str(e)}", False
393
-
394
- processor = DocumentProcessor()
395
-
396
- def ask_question(question, chat_history):
397
- if not question.strip():
398
- return chat_history
399
- answer, success = processor.query(question)
400
- return chat_history + [(question, answer if success else f"Error: {answer}")]
401
-
402
- with gr.Blocks(title="RAG System", css=".footer {display: none !important}") as app:
403
- gr.Markdown("## Multi-Format-Reader")
404
- with gr.Row():
405
- files = gr.File(file_count="multiple",
406
- file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
407
- label="Upload Documents")
408
- process_btn = gr.Button("Process", variant="primary")
409
- status = gr.Textbox(label="Processing Status", interactive=False)
410
- chatbot = gr.Chatbot(height=500, label="Chat History")
411
- with gr.Row():
412
- question = gr.Textbox(label="Your Query", placeholder="Enter your question...", max_lines=3)
413
- ask_btn = gr.Button("Ask", variant="primary")
414
- clear_btn = gr.Button("Clear Chat")
415
-
416
- process_btn.click(
417
- processor.process_documents,
418
- files,
419
- status
420
- )
421
- ask_btn.click(
422
- ask_question,
423
- [question, chatbot],
424
- chatbot
425
- ).then(lambda: "", None, question)
426
- clear_btn.click(lambda: [], None, chatbot)
427
-
428
- app.launch()
429
- '''
 
200
  return chat_history + [(question, answer)]
201
 
202
  with gr.Blocks(title="RAG System") as app:
203
+ gr.Markdown("## 🚀 Multi-Format-Reader Chat-Bot")
204
  with gr.Row():
205
  files = gr.File(file_count="multiple",
206
  file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
 
235
 
236
  app.launch(share=True, debug=True)
237