gmustafa413 committed on
Commit
38923df
·
verified ·
1 Parent(s): 886e1a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -235
app.py CHANGED
@@ -10,240 +10,6 @@ from docx import Document
10
  from pptx import Presentation
11
  from sentence_transformers import SentenceTransformer
12
  from concurrent.futures import ThreadPoolExecutor
13
- import os
14
-
15
- # Configuration - Get API key from Hugging Face secrets
16
- GEMINI_API_KEY = os.environ.get("AIzaSyAPF8eVHU2jRWrQfwD8J9HPz4DrfIWK4GQ")
17
- MODEL_NAME = "all-MiniLM-L6-v2"
18
- CHUNK_SIZE = 1024
19
- MAX_TOKENS = 4096
20
- MODEL = SentenceTransformer(MODEL_NAME)
21
- WORKERS = 8
22
-
23
- class DocumentProcessor:
24
- def __init__(self):
25
- self.index = faiss.IndexFlatIP(MODEL.get_sentence_embedding_dimension())
26
- self.chunks = []
27
- self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)
28
-
29
- # File processing methods remain unchanged from original
30
- def extract_text_from_pptx(self, file_path):
31
- try:
32
- prs = Presentation(file_path)
33
- return " ".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")])
34
- except Exception as e:
35
- print(f"PPTX Error: {str(e)}")
36
- return ""
37
-
38
- def extract_text_from_xls_csv(self, file_path):
39
- try:
40
- if file_path.endswith(('.xls', '.xlsx')):
41
- df = pd.read_excel(file_path)
42
- else:
43
- df = pd.read_csv(file_path)
44
- return " ".join(df.astype(str).values.flatten())
45
- except Exception as e:
46
- print(f"Spreadsheet Error: {str(e)}")
47
- return ""
48
-
49
- def extract_text_from_pdf(self, file_path):
50
- try:
51
- doc = fitz.open(file_path)
52
- return " ".join(page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE) for page in doc)
53
- except Exception as e:
54
- print(f"PDF Error: {str(e)}")
55
- return ""
56
-
57
- def process_file(self, file):
58
- try:
59
- file_path = file.name
60
- print(f"Processing: {file_path}")
61
-
62
- if file_path.endswith('.pdf'):
63
- text = self.extract_text_from_pdf(file_path)
64
- elif file_path.endswith('.docx'):
65
- text = " ".join(p.text for p in Document(file_path).paragraphs)
66
- elif file_path.endswith('.txt'):
67
- with open(file_path, 'r', encoding='utf-8') as f:
68
- text = f.read()
69
- elif file_path.endswith('.pptx'):
70
- text = self.extract_text_from_pptx(file_path)
71
- elif file_path.endswith(('.xls', '.xlsx', '.csv')):
72
- text = self.extract_text_from_xls_csv(file_path)
73
- else:
74
- return ""
75
-
76
- clean_text = re.sub(r'\s+', ' ', text).strip()
77
- print(f"Extracted {len(clean_text)} characters from {file_path}")
78
- return clean_text
79
- except Exception as e:
80
- print(f"Processing Error: {str(e)}")
81
- return ""
82
-
83
- def semantic_chunking(self, text):
84
- words = re.findall(r'\S+\s*', text)
85
- chunks = [''.join(words[i:i+CHUNK_SIZE//2]) for i in range(0, len(words), CHUNK_SIZE//2)]
86
- return chunks[:1000]
87
-
88
- def process_documents(self, files):
89
- self.chunks = []
90
- if not files:
91
- return "No files uploaded!"
92
-
93
- print("\n" + "="*40 + " PROCESSING DOCUMENTS " + "="*40)
94
- texts = list(self.processor_pool.map(self.process_file, files))
95
-
96
- with ThreadPoolExecutor(max_workers=WORKERS) as executor:
97
- chunk_lists = list(executor.map(self.semantic_chunking, texts))
98
-
99
- all_chunks = [chunk for chunk_list in chunk_lists for chunk in chunk_list]
100
- print(f"Total chunks generated: {len(all_chunks)}")
101
-
102
- if not all_chunks:
103
- return "Error: No chunks generated from documents"
104
-
105
- try:
106
- embeddings = MODEL.encode(
107
- all_chunks,
108
- batch_size=256,
109
- convert_to_tensor=True,
110
- show_progress_bar=False
111
- ).cpu().numpy().astype('float32')
112
-
113
- self.index.reset()
114
- self.index.add(embeddings)
115
- self.chunks = all_chunks
116
- return f"Processed {len(all_chunks)} chunks from {len(files)} files"
117
- except Exception as e:
118
- print(f"Embedding Error: {str(e)}")
119
- return f"Error: {str(e)}"
120
-
121
- def query(self, question):
122
- if not self.chunks:
123
- return "Please process documents first", False
124
-
125
- try:
126
- print("\n" + "="*40 + " QUERY PROCESSING " + "="*40)
127
- print(f"Question: {question}")
128
-
129
- # Generate embedding for the question
130
- question_embedding = MODEL.encode([question], convert_to_tensor=True).cpu().numpy().astype('float32')
131
-
132
- # Search FAISS index
133
- _, indices = self.index.search(question_embedding, 3)
134
- print(f"Top indices: {indices}")
135
-
136
- # Get context from top chunks
137
- context = "\n".join([self.chunks[i] for i in indices[0] if i < len(self.chunks)])
138
- print(f"Context length: {len(context)} characters")
139
-
140
- # Gemini API Call
141
- url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key={GEMINI_API_KEY}"
142
- headers = {"Content-Type": "application/json"}
143
-
144
- payload = {
145
- "contents": [{
146
- "parts": [{
147
- "text": f"Answer concisely based on this context: {context}\n\nQuestion: {question}"
148
- }]
149
- }],
150
- "generationConfig": {
151
- "temperature": 0.3,
152
- "maxOutputTokens": MAX_TOKENS
153
- }
154
- }
155
-
156
- response = requests.post(
157
- url,
158
- headers=headers,
159
- json=payload,
160
- timeout=20
161
- )
162
-
163
- if response.status_code != 200:
164
- return f"API Error: {response.text}", False
165
-
166
- # Parse response
167
- try:
168
- response_json = response.json()
169
- final_answer = response_json['candidates'][0]['content']['parts'][0]['text']
170
- except (KeyError, IndexError) as e:
171
- print(f"Response parsing error: {str(e)}")
172
- return "Error: Could not parse API response", False
173
-
174
- return final_answer, True
175
-
176
- except Exception as e:
177
- print(f"Query Error: {str(e)}")
178
- return f"Error: {str(e)}", False
179
-
180
- # Initialize processor
181
- processor = DocumentProcessor()
182
-
183
- # Gradio interface with improved error handling
184
- with gr.Blocks(theme=gr.themes.Soft(), title="Chatbot") as app:
185
- gr.Markdown("## 📚 Multi-Format Document Chatbot")
186
-
187
- with gr.Row():
188
- with gr.Column(scale=2):
189
- files = gr.File(
190
- file_count="multiple",
191
- file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
192
- label="Upload Documents",
193
- max_size=500*1024*1024 # 500MB limit
194
- )
195
- process_btn = gr.Button("Process Documents", variant="primary")
196
- status = gr.Textbox(label="Processing Status")
197
-
198
- with gr.Column(scale=3):
199
- chatbot = gr.Chatbot(height=500, label="Chat History")
200
- question = gr.Textbox(
201
- label="Ask a question",
202
- placeholder="Type your question here...",
203
- max_lines=3
204
- )
205
- with gr.Row():
206
- ask_btn = gr.Button("Ask", variant="primary")
207
- clear_btn = gr.Button("Clear Chat")
208
-
209
- process_btn.click(
210
- fn=processor.process_documents,
211
- inputs=files,
212
- outputs=status,
213
- api_name="process_documents"
214
- )
215
-
216
- ask_btn.click(
217
- fn=lambda q, h: ask_question(q, h),
218
- inputs=[question, chatbot],
219
- outputs=chatbot,
220
- api_name="ask_question"
221
- ).then(lambda: "", None, question)
222
-
223
- clear_btn.click(
224
- fn=lambda: [],
225
- inputs=None,
226
- outputs=chatbot,
227
- api_name="clear_chat"
228
- )
229
-
230
- if __name__ == "__main__":
231
- app.launch(debug=True)
232
- # ... (keep the rest of the Gradio interface code unchanged) ...
233
-
234
- '''
235
- import gradio as gr
236
- import fitz
237
- import numpy as np
238
- import requests
239
- import faiss
240
- import re
241
- import json
242
- import pandas as pd
243
- from docx import Document
244
- from pptx import Presentation
245
- from sentence_transformers import SentenceTransformer
246
- from concurrent.futures import ThreadPoolExecutor
247
 
248
  # Configuration
249
  GROQ_API_KEY = "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" # 🔑 REPLACE WITH YOUR ACTUAL KEY
@@ -465,4 +231,3 @@ with gr.Blocks(title="RAG System") as app:
465
  )
466
 
467
  app.launch(share=True, debug=True)
468
- '''
 
10
  from pptx import Presentation
11
  from sentence_transformers import SentenceTransformer
12
  from concurrent.futures import ThreadPoolExecutor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Configuration
15
  GROQ_API_KEY = "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" # 🔑 REPLACE WITH YOUR ACTUAL KEY
 
231
  )
232
 
233
  app.launch(share=True, debug=True)