gmustafa413 committed on
Commit
7dd7442
·
verified ·
1 Parent(s): 24ea78c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -10
app.py CHANGED
@@ -1,5 +1,3 @@
1
- # Install dependencies
2
-
3
  import gradio as gr
4
  import fitz
5
  import numpy as np
@@ -12,9 +10,10 @@ from docx import Document
12
  from pptx import Presentation
13
  from sentence_transformers import SentenceTransformer
14
  from concurrent.futures import ThreadPoolExecutor
 
15
 
16
- # Configuration
17
- GEMINI_API_KEY = "AIzaSyAPF8eVHU2jRWrQfwD8J9HPz4DrfIWK4GQ" # 🔑 REPLACE WITH YOUR GEMINI KEY
18
  MODEL_NAME = "all-MiniLM-L6-v2"
19
  CHUNK_SIZE = 1024
20
  MAX_TOKENS = 4096
@@ -27,7 +26,97 @@ class DocumentProcessor:
27
  self.chunks = []
28
  self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)
29
 
30
- # ... (keep all existing document processing methods unchanged) ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def query(self, question):
33
  if not self.chunks:
@@ -71,26 +160,75 @@ class DocumentProcessor:
71
  timeout=20
72
  )
73
 
74
- print(f"API Status Code: {response.status_code}")
75
-
76
  if response.status_code != 200:
77
  return f"API Error: {response.text}", False
78
 
79
- # Parse Gemini response
80
- response_json = response.json()
81
  try:
 
82
  final_answer = response_json['candidates'][0]['content']['parts'][0]['text']
83
  except (KeyError, IndexError) as e:
84
  print(f"Response parsing error: {str(e)}")
85
  return "Error: Could not parse API response", False
86
 
87
- print(f"Final Answer: {final_answer}")
88
  return final_answer, True
89
 
90
  except Exception as e:
91
  print(f"Query Error: {str(e)}")
92
  return f"Error: {str(e)}", False
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  # ... (keep the rest of the Gradio interface code unchanged) ...
95
 
96
  '''
 
 
 
1
  import gradio as gr
2
  import fitz
3
  import numpy as np
 
10
  from pptx import Presentation
11
  from sentence_transformers import SentenceTransformer
12
  from concurrent.futures import ThreadPoolExecutor
13
import os

# Configuration — read the Gemini key from the environment (e.g. a
# Hugging Face Space secret named GEMINI_API_KEY).  Never hard-code the
# key itself, and never pass the key *value* as the variable name:
# the original called os.environ.get("AIzaSy...") which both leaked the
# secret and always returned None.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
MODEL_NAME = "all-MiniLM-L6-v2"   # sentence-transformers embedding model
CHUNK_SIZE = 1024                 # words-with-whitespace per chunk window
MAX_TOKENS = 4096                 # generation cap for the Gemini API
 
26
  self.chunks = []
27
  self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)
28
 
29
+ # File processing methods remain unchanged from original
30
+ def extract_text_from_pptx(self, file_path):
31
+ try:
32
+ prs = Presentation(file_path)
33
+ return " ".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")])
34
+ except Exception as e:
35
+ print(f"PPTX Error: {str(e)}")
36
+ return ""
37
+
38
+ def extract_text_from_xls_csv(self, file_path):
39
+ try:
40
+ if file_path.endswith(('.xls', '.xlsx')):
41
+ df = pd.read_excel(file_path)
42
+ else:
43
+ df = pd.read_csv(file_path)
44
+ return " ".join(df.astype(str).values.flatten())
45
+ except Exception as e:
46
+ print(f"Spreadsheet Error: {str(e)}")
47
+ return ""
48
+
49
+ def extract_text_from_pdf(self, file_path):
50
+ try:
51
+ doc = fitz.open(file_path)
52
+ return " ".join(page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE) for page in doc)
53
+ except Exception as e:
54
+ print(f"PDF Error: {str(e)}")
55
+ return ""
56
+
57
+ def process_file(self, file):
58
+ try:
59
+ file_path = file.name
60
+ print(f"Processing: {file_path}")
61
+
62
+ if file_path.endswith('.pdf'):
63
+ text = self.extract_text_from_pdf(file_path)
64
+ elif file_path.endswith('.docx'):
65
+ text = " ".join(p.text for p in Document(file_path).paragraphs)
66
+ elif file_path.endswith('.txt'):
67
+ with open(file_path, 'r', encoding='utf-8') as f:
68
+ text = f.read()
69
+ elif file_path.endswith('.pptx'):
70
+ text = self.extract_text_from_pptx(file_path)
71
+ elif file_path.endswith(('.xls', '.xlsx', '.csv')):
72
+ text = self.extract_text_from_xls_csv(file_path)
73
+ else:
74
+ return ""
75
+
76
+ clean_text = re.sub(r'\s+', ' ', text).strip()
77
+ print(f"Extracted {len(clean_text)} characters from {file_path}")
78
+ return clean_text
79
+ except Exception as e:
80
+ print(f"Processing Error: {str(e)}")
81
+ return ""
82
+
83
+ def semantic_chunking(self, text):
84
+ words = re.findall(r'\S+\s*', text)
85
+ chunks = [''.join(words[i:i+CHUNK_SIZE//2]) for i in range(0, len(words), CHUNK_SIZE//2)]
86
+ return chunks[:1000]
87
+
88
+ def process_documents(self, files):
89
+ self.chunks = []
90
+ if not files:
91
+ return "No files uploaded!"
92
+
93
+ print("\n" + "="*40 + " PROCESSING DOCUMENTS " + "="*40)
94
+ texts = list(self.processor_pool.map(self.process_file, files))
95
+
96
+ with ThreadPoolExecutor(max_workers=WORKERS) as executor:
97
+ chunk_lists = list(executor.map(self.semantic_chunking, texts))
98
+
99
+ all_chunks = [chunk for chunk_list in chunk_lists for chunk in chunk_list]
100
+ print(f"Total chunks generated: {len(all_chunks)}")
101
+
102
+ if not all_chunks:
103
+ return "Error: No chunks generated from documents"
104
+
105
+ try:
106
+ embeddings = MODEL.encode(
107
+ all_chunks,
108
+ batch_size=256,
109
+ convert_to_tensor=True,
110
+ show_progress_bar=False
111
+ ).cpu().numpy().astype('float32')
112
+
113
+ self.index.reset()
114
+ self.index.add(embeddings)
115
+ self.chunks = all_chunks
116
+ return f"Processed {len(all_chunks)} chunks from {len(files)} files"
117
+ except Exception as e:
118
+ print(f"Embedding Error: {str(e)}")
119
+ return f"Error: {str(e)}"
120
 
121
  def query(self, question):
122
  if not self.chunks:
 
160
  timeout=20
161
  )
162
 
 
 
163
  if response.status_code != 200:
164
  return f"API Error: {response.text}", False
165
 
166
+ # Parse response
 
167
  try:
168
+ response_json = response.json()
169
  final_answer = response_json['candidates'][0]['content']['parts'][0]['text']
170
  except (KeyError, IndexError) as e:
171
  print(f"Response parsing error: {str(e)}")
172
  return "Error: Could not parse API response", False
173
 
 
174
  return final_answer, True
175
 
176
  except Exception as e:
177
  print(f"Query Error: {str(e)}")
178
  return f"Error: {str(e)}", False
179
 
180
# Initialize processor
processor = DocumentProcessor()

# Gradio interface with improved error handling
with gr.Blocks(theme=gr.themes.Soft(), title="Chatbot") as app:
    gr.Markdown("## 📚 Multi-Format Document Chatbot")

    with gr.Row():
        with gr.Column(scale=2):
            # NOTE(review): gr.File() takes no `max_size` kwarg — passing
            # one raises TypeError at startup.  Enforce an upload limit via
            # the server instead (e.g. app.launch(max_file_size=...)).
            files = gr.File(
                file_count="multiple",
                file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
                label="Upload Documents"
            )
            process_btn = gr.Button("Process Documents", variant="primary")
            status = gr.Textbox(label="Processing Status")

        with gr.Column(scale=3):
            chatbot = gr.Chatbot(height=500, label="Chat History")
            question = gr.Textbox(
                label="Ask a question",
                placeholder="Type your question here...",
                max_lines=3
            )
            with gr.Row():
                ask_btn = gr.Button("Ask", variant="primary")
                clear_btn = gr.Button("Clear Chat")

    # Wire document processing to the status box.
    process_btn.click(
        fn=processor.process_documents,
        inputs=files,
        outputs=status,
        api_name="process_documents"
    )

    # Pass the handler directly — the original wrapped it in an
    # equivalent `lambda q, h: ask_question(q, h)` for no benefit.
    # Clears the question box after each answer.
    ask_btn.click(
        fn=ask_question,
        inputs=[question, chatbot],
        outputs=chatbot,
        api_name="ask_question"
    ).then(lambda: "", None, question)

    clear_btn.click(
        fn=lambda: [],
        inputs=None,
        outputs=chatbot,
        api_name="clear_chat"
    )

if __name__ == "__main__":
    app.launch(debug=True)
232
  # ... (keep the rest of the Gradio interface code unchanged) ...
233
 
234
  '''