Zahid0123 commited on
Commit
9bb6323
·
verified ·
1 Parent(s): 04769eb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +870 -0
app.py ADDED
@@ -0,0 +1,870 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===================================================================
2
+ # AI Research Agent - Agentic RAG System for Hugging Face Spaces
3
+ # ===================================================================
4
+ import os
5
+ import re
6
+ import json
7
+ import ast
8
+ import operator
9
+ import logging
10
+ import requests
11
+ import tempfile
12
+ import time
13
+ import asyncio
14
+ from pathlib import Path
15
+ from typing import List, Dict, Any, Optional
16
+ from datetime import datetime
17
+ from urllib.parse import quote_plus
18
+
19
+ # Core Libraries
20
+ import numpy as np
21
+ import pandas as pd
22
+ from tqdm import tqdm
23
+
24
+ # ML & Embedding
25
+ import PyPDF2
26
+ from sentence_transformers import SentenceTransformer
27
+ import faiss
28
+
29
+ # LLM & Web
30
+ import groq
31
+ from groq import Groq
32
+
33
+ # UI & Voice
34
+ import gradio as gr
35
+ from gtts import gTTS
36
+ try:
37
+ import speech_recognition as sr
38
+ STT_AVAILABLE = True
39
+ except ImportError:
40
+ STT_AVAILABLE = False
41
+
42
+ GTTS_AVAILABLE = True
43
+
44
+ # ===================================================================
45
+ # CONFIGURATION & LOGGING
46
+ # ===================================================================
47
+ logging.basicConfig(level=logging.INFO)
48
+ logger = logging.getLogger(__name__)
49
+
50
+ # ===================================================================
51
+ # UTILITY CLASSES
52
+ # ===================================================================
53
+
54
class WebSearchTool:
    """Thin client for the DuckDuckGo Instant Answer API."""

    def __init__(self, max_results: int = 5, timeout: int = 10):
        self.max_results = max_results
        self.timeout = timeout
        self.base_url = "https://api.duckduckgo.com/"

    def search(self, query: str, num_results: Optional[int] = None) -> Dict[str, Any]:
        """Query the instant-answer endpoint and return a summary dict.

        On any failure the dict carries an 'error' key and
        'results_found' is False.
        """
        limit = num_results or self.max_results
        try:
            response = requests.get(
                self.base_url,
                params={
                    'q': query,
                    'format': 'json',
                    'no_redirect': '1',
                    'no_html': '1',
                    'skip_disambig': '1',
                },
                timeout=self.timeout,
                headers={'User-Agent': 'AI Research Agent 1.0'},
            )
            response.raise_for_status()
            payload = response.json()

            summary = {
                'query': query,
                'abstract': payload.get('Abstract', ''),
                'abstract_source': payload.get('AbstractSource', ''),
                'answer': payload.get('Answer', ''),
                'related_topics': [],
                # "found" means the API produced either an abstract or a direct answer
                'results_found': bool(any([payload.get('Abstract'), payload.get('Answer')])),
            }

            for topic in payload.get('RelatedTopics', [])[:limit]:
                if isinstance(topic, dict) and 'Text' in topic:
                    summary['related_topics'].append({
                        'text': topic.get('Text', ''),
                        'url': topic.get('FirstURL', ''),
                    })
            return summary
        except Exception as exc:
            logger.error(f"Web search failed: {exc}")
            return {'query': query, 'error': str(exc), 'results_found': False}
95
+
96
+
97
class ConfigManager:
    """Central place for the pipeline's default tuning knobs."""

    DEFAULT_CONFIG = {
        'embedding_model': 'all-MiniLM-L6-v2',
        'groq_model': 'llama-3.1-8b-instant',
        'max_iterations': 5,
        'confidence_threshold': 0.7,
        'retrieval_k': 5,
        'chunk_size': 512,
        'chunk_overlap': 50,
    }

    @staticmethod
    def load_config():
        """Return a private copy of the defaults so callers can mutate freely."""
        return dict(ConfigManager.DEFAULT_CONFIG)
111
+
112
+
113
+ # ===================================================================
114
+ # DOCUMENT PROCESSING
115
+ # ===================================================================
116
+
117
class DocumentProcessor:
    """Loads raw text from .txt/.md/.pdf files under a directory tree."""

    def __init__(self):
        self.supported_extensions = {'.txt', '.md', '.pdf'}

    def load_documents(self, data_directory: str) -> List[Dict[str, Any]]:
        """Walk *data_directory* recursively and return one dict per readable file.

        Each dict carries 'doc_id' (path relative to the root), 'content',
        'file_path' and 'file_type'. Empty or unreadable files are logged
        and skipped; a missing directory yields an empty list.
        """
        documents = []
        data_path = Path(data_directory)
        if not data_path.exists():
            return documents

        files = [f for f in data_path.rglob('*') if f.suffix.lower() in self.supported_extensions]
        for file_path in tqdm(files, desc="Loading documents"):
            try:
                content = self._extract_text(file_path)
                if content.strip():
                    doc = {
                        'doc_id': str(file_path.relative_to(data_path)),
                        'content': content,
                        'file_path': str(file_path),
                        'file_type': file_path.suffix.lower()
                    }
                    documents.append(doc)
            except Exception as e:
                logger.error(f"Error loading {file_path}: {e}")
        return documents

    def _extract_text(self, file_path: Path) -> str:
        """Extract plain text from a supported file; '' for anything else.

        Bug fix: '.md' was advertised in supported_extensions but had no
        extraction branch, so markdown files fell through to the empty-string
        default and were silently dropped by load_documents. Markdown is
        plain text, so it now shares the '.txt' branch.
        """
        extension = file_path.suffix.lower()
        if extension in ('.txt', '.md'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        elif extension == '.pdf':
            text = ""
            with open(file_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
            return text
        return ""
156
+
157
+
158
class DocumentChunker:
    """Splits documents into overlapping chunks, preferring sentence boundaries."""

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_documents(self, documents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Chunk every document; each chunk keeps provenance metadata."""
        chunks = []
        for doc in tqdm(documents, desc="Chunking documents"):
            doc_chunks = self._split_text(doc['content'])
            for i, chunk_text in enumerate(doc_chunks):
                chunk = {
                    'chunk_id': f"{doc['doc_id']}_chunk_{i}",
                    'content': chunk_text,
                    'doc_id': doc['doc_id'],
                    'chunk_index': i,
                    'source_file': doc['file_path'],
                    'file_type': doc['file_type']
                }
                chunks.append(chunk)
        return chunks

    def _split_text(self, text: str) -> List[str]:
        """Split *text* into chunks of at most chunk_size characters.

        Prefers to cut at the last sentence terminator past the chunk's
        midpoint, then at the last space, else cuts hard at chunk_size.
        Consecutive chunks overlap by chunk_overlap characters; fragments
        of 10 chars or fewer are discarded.

        Bug fix: the indices returned by rfind() are relative to the current
        window, but they were compared against ``start + chunk_size // 2``
        (an absolute offset into the full text). From the second chunk onward
        that comparison could never succeed, so later chunks were always cut
        mid-word. Compare against the window-relative midpoint instead.
        """
        text = re.sub(r'\s+', ' ', text.strip())
        if len(text) <= self.chunk_size:
            return [text]

        chunks = []
        start = 0
        half = self.chunk_size // 2  # minimum acceptable boundary inside a window
        while start < len(text):
            end = start + self.chunk_size
            if end >= len(text):
                chunks.append(text[start:])
                break

            window = text[start:end]
            last_sentence = max(window.rfind('.'), window.rfind('!'), window.rfind('?'))
            if last_sentence > half:
                end = start + last_sentence + 1
            else:
                last_space = window.rfind(' ')
                if last_space > half:
                    end = start + last_space

            chunks.append(text[start:end].strip())
            start = end - self.chunk_overlap

        return [chunk for chunk in chunks if len(chunk.strip()) > 10]
205
+
206
+
207
class EmbeddingGenerator:
    """Wraps a SentenceTransformer model for chunk and query embeddings."""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def generate_embeddings(self, chunks: List[Dict[str, Any]]) -> np.ndarray:
        """Embed every chunk's 'content' in batches; returns (n_chunks, dim)."""
        contents = [chunk['content'] for chunk in chunks]
        return self.model.encode(
            contents,
            batch_size=32,
            show_progress_bar=True,
            convert_to_numpy=True,
        )

    def get_query_embedding(self, query: str) -> np.ndarray:
        """Embed a single query string; returns a 1-D vector."""
        return self.model.encode([query], convert_to_numpy=True)[0]
219
+
220
+
221
def build_embeddings_from_directory(data_directory: str, output_directory: str,
                                    chunk_size: int = 512, chunk_overlap: int = 50) -> Dict[str, Any]:
    """Run the load -> chunk -> embed pipeline over *data_directory*.

    Returns {} when no documents could be loaded; otherwise a dict with the
    chunk list, the embedding matrix, and summary metadata.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Collaborators are created up front (this loads the embedding model).
    doc_processor = DocumentProcessor()
    chunker = DocumentChunker(chunk_size, chunk_overlap)
    embedder = EmbeddingGenerator()

    documents = doc_processor.load_documents(data_directory)
    if not documents:
        return {}

    chunks = chunker.chunk_documents(documents)
    embeddings = embedder.generate_embeddings(chunks)

    metadata = {
        'num_documents': len(documents),
        'num_chunks': len(chunks),
        'embedding_dim': embeddings.shape[1],
    }
    return {'chunks': chunks, 'embeddings': embeddings, 'metadata': metadata}
244
+
245
+
246
+ # ===================================================================
247
+ # RETRIEVER
248
+ # ===================================================================
249
+
250
class DocumentRetriever:
    """Cosine-similarity retriever backed by a FAISS inner-product index."""

    def __init__(self, embedding_model_name: str = 'all-MiniLM-L6-v2'):
        self.embedding_generator = EmbeddingGenerator(embedding_model_name)
        self.index = None
        self.chunks = []
        self.embeddings = None

    def build_index(self, chunks: List[Dict[str, Any]], embeddings: np.ndarray) -> None:
        """L2-normalise the embeddings and load them into a fresh IndexFlatIP,
        so inner product equals cosine similarity."""
        self.chunks = chunks
        self.embeddings = embeddings
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        normalised = self._normalize_embeddings(embeddings)
        self.index.add(normalised.astype(np.float32))

    def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
        """Row-wise L2 normalisation; zero rows are left as-is (norm forced to 1)."""
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return embeddings / norms

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """Return up to *k* chunks ranked by cosine similarity to *query*."""
        if not self.index:
            return []

        query_vec = self.embedding_generator.get_query_embedding(query)
        query_vec = self._normalize_embeddings(query_vec.reshape(1, -1))
        scores, indices = self.index.search(query_vec.astype(np.float32), k)

        hits = []
        for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
            if idx >= 0:  # faiss pads missing results with -1
                hit = dict(self.chunks[idx])
                hit.update({'similarity_score': float(score), 'rank': rank})
                hits.append(hit)
        return hits
285
+
286
+
287
+ # ===================================================================
288
+ # AGENTIC TOOLS
289
+ # ===================================================================
290
+
291
class AgenticTools:
    """Registry of callable tools the agent can invoke by name."""

    def __init__(self):
        self.tools = {
            "calculator": self.calculator_tool,
            "web_search": self.web_search_tool,
            "fact_checker": self.fact_checker_tool,
            "document_analyzer": self.document_analyzer_tool
        }
        self.web_search_instance = WebSearchTool()

    def calculator_tool(self, expression: str) -> Dict[str, Any]:
        """Safely evaluate an arithmetic expression.

        Only digits, + - * / ( ) . and whitespace survive the sanitising
        regex, and evaluation walks the AST by hand, so no arbitrary code
        can run (unlike eval()). Returns a result dict with 'success' and
        either 'result' or 'error'.
        """
        try:
            clean_expr = re.sub(r'[^0-9+\-*/().\s]', '', expression)
            node = ast.parse(clean_expr, mode='eval')
            result = self._eval_expr(node.body)
            return {
                "tool": "calculator",
                "input": expression,
                "result": result,
                "success": True,
                "explanation": f"Calculated {clean_expr} = {result}"
            }
        except Exception as e:
            return {"tool": "calculator", "input": expression, "result": None, "success": False, "error": str(e)}

    def _eval_expr(self, node):
        """Recursively evaluate a whitelisted arithmetic AST node."""
        ops = {
            ast.Add: operator.add, ast.Sub: operator.sub,
            ast.Mult: operator.mul, ast.Div: operator.truediv,
            ast.Pow: operator.pow, ast.USub: operator.neg
        }
        # Modernisation: ast.Constant replaces ast.Num, which has been
        # deprecated since Python 3.8 and is slated for removal. Only
        # numeric constants are accepted.
        if isinstance(node, ast.Constant):
            if isinstance(node.value, (int, float)):
                return node.value
            raise TypeError(node)
        elif isinstance(node, ast.BinOp):
            return ops[type(node.op)](self._eval_expr(node.left), self._eval_expr(node.right))
        elif isinstance(node, ast.UnaryOp):
            return ops[type(node.op)](self._eval_expr(node.operand))
        raise TypeError(node)

    def web_search_tool(self, query: str) -> Dict[str, Any]:
        """Delegate to the shared WebSearchTool instance."""
        try:
            result = self.web_search_instance.search(query)
            return {
                "tool": "web_search",
                "input": query,
                "result": result,
                "success": result.get('results_found', False),
                "explanation": f"Found web information about: {query}"
            }
        except Exception as e:
            return {"tool": "web_search", "input": query, "result": None, "success": False, "error": str(e)}

    def fact_checker_tool(self, claim: str) -> Dict[str, Any]:
        """Heuristic verification stub: any claim containing digits is
        flagged as needing calculation; everything else is 'partial'."""
        confidence = "medium"
        verification = "partial"
        if re.search(r'\d+', claim):
            verification = "requires_calculation"
        return {
            "tool": "fact_checker",
            "input": claim,
            "result": {"verification": verification, "confidence": confidence},
            "success": True
        }

    def document_analyzer_tool(self, text: str, analysis_type: str = "summary") -> Dict[str, Any]:
        """Return a naive summary built from the first three sentences."""
        sentences = re.split(r'[.!?]+', text)[:3]
        summary = '. '.join([s.strip() for s in sentences if s.strip()])
        return {
            "tool": "document_analyzer",
            "input": f"{analysis_type} analysis",
            "result": summary,
            "success": True
        }
364
+
365
+
366
class AgentPlanner:
    """Keyword-driven planner that maps a query to an ordered tool plan."""

    def __init__(self):
        # Capability -> trigger substrings (matched case-insensitively).
        self.planning_patterns = {
            "calculation": ["calculate", "compute", "math", "percentage", "total"],
            "current_info": ["latest", "recent", "current", "rate", "price", "exchange", "dollar", "currency"],
            "analysis": ["analyze", "insights", "patterns", "summary"],
            "fact_check": ["verify", "confirm", "accurate"]
        }

    def create_execution_plan(self, query: str) -> Dict[str, Any]:
        """Build the step list: document search first, any triggered optional
        tools in fixed order, then a final synthesis step that depends on all
        previous steps."""
        lowered = query.lower()
        needed_capabilities = [
            capability
            for capability, keywords in self.planning_patterns.items()
            if any(word in lowered for word in keywords)
        ]

        steps = [{"step": 1, "tool": "document_search", "description": "Search documents", "query": query}]

        if "calculation" in needed_capabilities:
            steps.append({"step": len(steps) + 1, "tool": "calculator",
                          "description": "Perform calculations", "depends_on": [1]})
        if "current_info" in needed_capabilities:
            steps.append({"step": len(steps) + 1, "tool": "web_search",
                          "description": "Search web", "query": query, "depends_on": [1]})
        if "analysis" in needed_capabilities:
            steps.append({"step": len(steps) + 1, "tool": "document_analyzer",
                          "description": "Analyze content", "depends_on": [1]})

        final = len(steps) + 1
        steps.append({"step": final, "tool": "synthesizer",
                      "description": "Synthesize results", "depends_on": list(range(1, final))})

        return {"query": query, "detected_needs": needed_capabilities, "steps": steps, "total_steps": len(steps)}
398
+
399
+
400
class ResultSynthesizer:
    """Folds tool outputs into a single answer via the Groq chat API."""

    def __init__(self, groq_client):
        self.groq_client = groq_client

    def synthesize_results(self, query: str, results: Dict[str, Any], temperature: float = 0.3, max_tokens: int = 500) -> str:
        """Assemble a context block from successful tools and ask the LLM
        for a combined answer.

        If the API call fails for any reason, falls back to a truncated dump
        of the raw context so the caller always gets a string.
        """
        context_parts = []
        if "document_search" in results and results["document_search"]["success"]:
            context_parts.append(f"DOCUMENTS:\n{results['document_search']['result']}")
        if "web_search" in results and results["web_search"]["success"]:
            web_info = results["web_search"]["result"]
            context_parts.append(f"WEB INFO:\n{web_info.get('abstract', '')} {web_info.get('answer', '')}")
        if "calculator" in results and results["calculator"]["success"]:
            context_parts.append(f"CALCULATION:\n{results['calculator']['result']}")

        all_context = "\n\n".join(context_parts)
        prompt = f"""Based on the following information, provide a comprehensive answer.

QUESTION: {query}

INFORMATION:
{all_context}

Provide a clear, direct answer synthesizing all sources."""

        try:
            completion = self.groq_client.chat.completions.create(
                model="llama-3.1-8b-instant",
                messages=[
                    {"role": "system", "content": "You are an expert research assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=temperature,
                max_tokens=max_tokens,
            )
            return completion.choices[0].message.content.strip()
        except Exception:
            # Degrade gracefully when the LLM is unreachable.
            return f"Based on available information: {all_context[:500]}..."
438
+
439
+
440
class AgenticEvaluator:
    """Scores a finished run by tool success rate and source diversity."""

    def evaluate_response(self, query: str, response: str, tool_results: Dict[str, Any]) -> Dict[str, Any]:
        """Return confidence / completeness / source-diversity metrics.

        Confidence is the success ratio capped at 0.8 (0.0 when nothing
        succeeded); completeness is 'comprehensive' only when every tool
        succeeded.
        """
        total_tools = len(tool_results)
        successful_tools = sum(1 for outcome in tool_results.values() if outcome.get("success", False))

        if successful_tools > 0:
            confidence = min(0.8, successful_tools / max(total_tools, 1))
        else:
            confidence = 0.0

        source_types = [
            label
            for key, label in (("document_search", "documents"), ("web_search", "web"))
            if key in tool_results and tool_results[key]["success"]
        ]

        return {
            "confidence_score": confidence,
            "completeness": "comprehensive" if successful_tools >= total_tools else "partial",
            "source_diversity": len(source_types),
            "recommendations": []
        }
458
+
459
+
460
+ # ===================================================================
461
+ # MAIN AGENT CLASS
462
+ # ===================================================================
463
+
464
class AgenticRAGAgent:
    """Top-level orchestrator for the app.

    Owns the retriever, the tool registry, the planner, the synthesizer and
    the evaluator; exposes the three entry points the Gradio UI binds to
    (process_agentic_query, upload_documents, update_settings) plus
    text-to-speech helpers.
    """

    def __init__(self):
        self.config = ConfigManager.load_config()
        self.retriever = None            # built lazily by upload_documents
        self.groq_client = None          # set below only if GROQ_API_KEY is present
        self.conversation_history = []   # one entry appended per completed query

        self.tools = AgenticTools()
        self.planner = AgentPlanner()
        self.synthesizer = None          # requires a working Groq client
        self.evaluator = AgenticEvaluator()

        # Generation / chunking knobs; all mutable via update_settings.
        self.temperature = 0.3
        self.max_tokens = 500
        self.chunk_size = 512
        self.chunk_overlap = 50
        self.retrieval_k = 8

        # Per-tool feature switches honoured during plan execution.
        self.enable_web_search = True
        self.enable_calculations = True
        self.enable_fact_checking = True
        self.enable_analysis = True

        # Initialize Groq
        groq_api_key = os.getenv("GROQ_API_KEY")
        if groq_api_key:
            try:
                self.groq_client = Groq(api_key=groq_api_key)
                self.synthesizer = ResultSynthesizer(self.groq_client)
                print("โœ… Groq API configured")
            except Exception as e:
                print(f"โŒ Error: {e}")

    def clean_text_for_speech(self, text: str) -> str:
        """Clean text for TTS: strip markdown, emoji and layout characters
        so gTTS does not read formatting aloud."""
        if not text:
            return ""

        # Remove markdown formatting
        text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)                      # bold
        text = re.sub(r'\*([^*]+)\*', r'\1', text)                          # italics
        text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)          # headings
        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)                # links -> label
        text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)            # fenced code
        text = re.sub(r'`([^`]+)`', r'\1', text)                            # inline code
        text = re.sub(r'^[\s]*[-*+โ€ข]\s+', '', text, flags=re.MULTILINE)    # bullet markers
        text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)      # numbered lists

        # Remove emojis
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"
            "\U0001F300-\U0001F5FF"
            "\U0001F680-\U0001F6FF"
            "\U0001F1E0-\U0001F1FF"
            "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251"
            "\U0001F900-\U0001F9FF"
            "\U00002600-\U000026FF"
            "\U00002700-\U000027BF"
            "]+"
        )
        text = emoji_pattern.sub('', text)
        text = re.sub(r'\s+', ' ', text)
        # NOTE(review): no newlines can remain after the \s+ collapse above,
        # so this substitution is effectively a no-op.
        text = re.sub(r'\n+', '. ', text)
        text = text.strip()
        text = re.sub(r'\.+', '.', text)  # collapse runs of periods

        return text

    def generate_audio_response(self, text: str):
        """Generate audio using gTTS.

        Returns the path of a timestamped .mp3 in the temp directory, or
        None when TTS is unavailable, the text cleans to empty, or
        synthesis fails (failure is logged, never raised).
        """
        if not text or not GTTS_AVAILABLE:
            return None

        clean_text = self.clean_text_for_speech(text)
        if not clean_text:
            return None

        try:
            temp_dir = tempfile.gettempdir()
            timestamp = int(time.time())
            audio_file = os.path.join(temp_dir, f"response_{timestamp}.mp3")

            tts = gTTS(text=clean_text, lang='en', slow=False)
            tts.save(audio_file)
            return audio_file
        except Exception as e:
            logger.error(f"Audio generation failed: {e}")
            return None

    def is_greeting_or_casual(self, query: str) -> bool:
        """True when the query starts with (or equals) a simple greeting."""
        query_lower = query.lower().strip()
        greetings = ['hi', 'hello', 'hey', 'howdy']
        return any(query_lower.startswith(g) for g in greetings) or query_lower in greetings

    def get_greeting_response(self, query: str) -> str:
        """Canned reply for greetings; the query itself is ignored."""
        return "Hi there! ๐Ÿ‘‹ I'm AI Research Agent with agentic capabilities. Upload PDF documents and ask complex questions!"

    def get_simple_answer(self, query, retrieved_docs):
        """Answer *query* from the top retrieved chunks via one Groq call.

        Returns an error string (never raises) when the client is missing
        or the call fails.
        """
        if not self.groq_client:
            return "Error: Groq API not configured"

        # Only the top 5 chunks are stuffed into the prompt context.
        context = "\n\n".join([doc.get('content', str(doc)) for doc in retrieved_docs[:5]])
        prompt = f"""Based on this context, provide a clear answer.

Context: {context}

Question: {query}

Answer:"""

        try:
            response = self.groq_client.chat.completions.create(
                model="llama-3.1-8b-instant",
                messages=[
                    {"role": "system", "content": "You are a helpful research assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"Error: {str(e)}"

    async def process_agentic_query(self, query, chat_history, progress=gr.Progress()):
        """Main chat handler: plan, run tools, synthesize, evaluate, speak.

        Returns (chat_history, "", audio_file_or_None) — the empty string
        clears the input textbox. Greetings short-circuit the pipeline, and
        a missing retriever produces an upload prompt instead of an error.
        """
        if not query.strip():
            return chat_history, "", None

        if chat_history is None:
            chat_history = []

        chat_history.append({"role": "user", "content": query})

        try:
            # Fast path: greetings never touch the retriever or tools.
            if self.is_greeting_or_casual(query):
                progress(0.5, desc="Generating response...")
                response = self.get_greeting_response(query)
                chat_history.append({"role": "assistant", "content": response})

                progress(0.8, desc="๐Ÿ”Š Generating voice...")
                audio_file = self.generate_audio_response(response)

                return chat_history, "", audio_file

            progress(0.1, desc="๐Ÿง  Planning...")

            # Guard: no documents indexed yet.
            if not self.retriever or not hasattr(self.retriever, 'index') or not self.retriever.index:
                error = "๐Ÿ“„ Please upload a PDF document first!"
                chat_history.append({"role": "assistant", "content": error})
                audio_file = self.generate_audio_response(error)
                return chat_history, "", audio_file

            plan = self.planner.create_execution_plan(query)
            progress(0.2, desc=f"๐Ÿ“‹ Plan: {len(plan['steps'])} steps")

            results = {}
            current_step = 0

            # Execute each planned step; progress spans 0.2 -> 0.8.
            for step in plan['steps']:
                current_step += 1
                progress_val = 0.2 + (current_step / len(plan['steps'])) * 0.6
                progress(progress_val, desc=f"๐Ÿ”ง Step {current_step}: {step['description']}")

                if step['tool'] == 'document_search':
                    retrieved_docs = self.retriever.search(query, k=self.retrieval_k)
                    if retrieved_docs:
                        doc_answer = self.get_simple_answer(query, retrieved_docs)
                        results['document_search'] = {"success": True, "result": doc_answer}
                    else:
                        results['document_search'] = {"success": False, "result": "No relevant info"}

                elif step['tool'] == 'calculator' and self.enable_calculations:
                    # Only the first arithmetic-looking fragment of the query is evaluated.
                    math_patterns = re.findall(r'[\d+\-*/().\s]+', query)
                    for expr in math_patterns:
                        if any(op in expr for op in ['+', '-', '*', '/']):
                            results['calculator'] = self.tools.calculator_tool(expr.strip())
                            break

                elif step['tool'] == 'web_search' and self.enable_web_search:
                    results['web_search'] = self.tools.web_search_tool(query)

                elif step['tool'] == 'document_analyzer' and self.enable_analysis:
                    # Analyzer runs on the document-search answer, not the raw chunks.
                    if 'document_search' in results and results['document_search']['success']:
                        doc_content = results['document_search']['result']
                        results['document_analyzer'] = self.tools.document_analyzer_tool(doc_content, "summary")

            progress(0.85, desc="๐Ÿ”ฌ Synthesizing...")

            if self.synthesizer:
                final_answer = self.synthesizer.synthesize_results(query, results, self.temperature, self.max_tokens)
            else:
                # No Groq client: fall back to concatenating raw tool outputs.
                successful = [r['result'] for r in results.values() if r.get('success')]
                final_answer = f"Based on available info: {' '.join(map(str, successful))}"

            progress(0.9, desc="๐Ÿ“Š Evaluating...")
            evaluation = self.evaluator.evaluate_response(query, final_answer, results)

            eval_summary = f"\n\n๐Ÿ’ก **Analysis:**\n"
            eval_summary += f"โ€ข Confidence: {evaluation['confidence_score']:.1%}\n"
            eval_summary += f"โ€ข Sources: {evaluation['source_diversity']} types\n"
            eval_summary += f"โ€ข Completeness: {evaluation['completeness']}"

            complete_response = final_answer + eval_summary

            progress(0.95, desc="๐Ÿ”Š Generating voice response...")
            # TTS gets the answer only — the analysis footer is not spoken.
            audio_file = self.generate_audio_response(final_answer)

            chat_history.append({"role": "assistant", "content": complete_response})

            # Persist the full run for potential later inspection.
            self.conversation_history.append({
                'timestamp': datetime.now().isoformat(),
                'query': query,
                'response': complete_response,
                'plan': plan,
                'results': results,
                'evaluation': evaluation,
                'audio_file': audio_file
            })

            progress(1.0, desc="โœ… Complete!")
            return chat_history, "", audio_file

        except Exception as e:
            error = f"โŒ Error: {str(e)}"
            chat_history.append({"role": "assistant", "content": error})
            return chat_history, "", None

    def upload_documents(self, files, progress=gr.Progress()):
        """Copy uploaded PDFs into ./sample_data, rebuild embeddings and the
        retriever index, and return a status string for the UI.

        Non-PDF uploads are silently skipped; all failures are reported as
        strings, never raised.
        """
        if not files:
            return "No files uploaded"

        try:
            progress(0.1, desc="Processing files...")
            os.makedirs("sample_data", exist_ok=True)

            uploaded = []
            for file in files:
                if hasattr(file, 'name') and file.name.endswith('.pdf'):
                    original = os.path.basename(file.name)
                    dest = os.path.join("sample_data", original)
                    with open(file.name, 'rb') as src:
                        with open(dest, 'wb') as dst:
                            dst.write(src.read())
                    uploaded.append(original)

            if not uploaded:
                return "โŒ No valid PDF files"

            progress(0.5, desc="Generating embeddings...")
            embeddings_data = build_embeddings_from_directory("sample_data", "temp_embeddings")

            if embeddings_data and 'embeddings' in embeddings_data:
                progress(0.8, desc="Building index...")
                # A fresh retriever replaces any previous index wholesale.
                self.retriever = DocumentRetriever()
                self.retriever.build_index(embeddings_data['chunks'], embeddings_data['embeddings'])

                doc_count = embeddings_data.get('metadata', {}).get('num_documents', 0)
                chunk_count = embeddings_data.get('metadata', {}).get('num_chunks', 0)

                progress(1.0, desc="Complete!")
                return f"""โœ… **Success!**

๐Ÿ“„ Files: {', '.join(uploaded)}
๐Ÿ“Š Documents: {doc_count} | Chunks: {chunk_count}

๐ŸŽฏ Ready for complex questions with voice support!"""
            else:
                return "โŒ Failed to process documents"
        except Exception as e:
            return f"โŒ Error: {str(e)}"

    def update_settings(self, temp, tokens, chunk_size, overlap, k, web, calc, fact, analysis):
        """Apply the sliders/checkboxes from the UI and echo a summary.

        Note: chunk_size/overlap only affect the NEXT upload; the existing
        index is not rebuilt here.
        """
        self.temperature = temp
        self.max_tokens = tokens
        self.chunk_size = chunk_size
        self.chunk_overlap = overlap
        self.retrieval_k = k
        self.enable_web_search = web
        self.enable_calculations = calc
        self.enable_fact_checking = fact
        self.enable_analysis = analysis

        return f"""โš™๏ธ Settings Updated:
โ€ข Temperature: {temp}
โ€ข Max Tokens: {tokens}
โ€ข Chunk Size: {chunk_size}
โ€ข Retrieved: {k}
โ€ข Web: {'โœ…' if web else 'โŒ'}
โ€ข Calc: {'โœ…' if calc else 'โŒ'}
โ€ข Voice Output: {'โœ…' if GTTS_AVAILABLE else 'โŒ'}"""
756
+
757
+
758
+ # ===================================================================
759
+ # GRADIO INTERFACE
760
+ # ===================================================================
761
+
762
def create_interface():
    """Build the Gradio Blocks UI and wire it to a fresh AgenticRAGAgent.

    Layout: a chat column (chatbot, input, clear button, autoplaying audio)
    next to a sidebar with PDF upload, tuning settings, and a voice-status
    panel. Returns the Blocks object; the caller launches it.
    """
    agent = AgenticRAGAgent()

    with gr.Blocks(title="๐Ÿค– AI Research Agent", theme=gr.themes.Soft()) as interface:
        # Static banner.
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px;">
            <h1 style="color: white; margin: 0;">๐Ÿค– AI Research Agent - Agentic RAG</h1>
            <p style="color: white; margin: 10px 0;">Advanced Multi-Tool Research Assistant with Voice Support ๐Ÿ”Š</p>
        </div>
        """)

        with gr.Row():
            # Left column: the chat itself.
            with gr.Column(scale=2):
                # type="messages" matches the role/content dicts the agent appends.
                chatbot = gr.Chatbot(label="๐Ÿ’ฌ Chat", height=500, type="messages")

                with gr.Row():
                    msg = gr.Textbox(label="", placeholder="Ask a complex research question...", scale=4)
                    submit_btn = gr.Button("๐Ÿš€ Send", variant="primary", scale=1)

                with gr.Row():
                    clear_btn = gr.Button("๐Ÿ—‘๏ธ Clear Chat", variant="secondary")

                # Audio Output — autoplays the TTS file returned by the agent.
                audio_output = gr.Audio(
                    label="๐Ÿ”Š Voice Response",
                    autoplay=True,
                    interactive=False
                )

            # Right column: upload, settings, and status.
            with gr.Column(scale=1):
                with gr.Group():
                    gr.HTML("<h3 style='text-align: center;'>๐Ÿ“„ Upload Documents</h3>")
                    file_upload = gr.Files(label="", file_types=[".pdf"], file_count="multiple")
                    upload_status = gr.Textbox(label="๐Ÿ“Š Status", interactive=False, max_lines=10)

                with gr.Accordion("โš™๏ธ Settings", open=False):
                    gr.HTML("<h4>๐Ÿง  AI Parameters</h4>")
                    temperature_slider = gr.Slider(0.0, 1.0, value=0.3, step=0.1, label="๐ŸŒก๏ธ Temperature")
                    max_tokens_slider = gr.Slider(100, 1000, value=500, step=50, label="๐Ÿ“ Max Tokens")

                    gr.HTML("<h4>๐Ÿ“„ Document Processing</h4>")
                    chunk_size_slider = gr.Slider(256, 1024, value=512, step=64, label="๐Ÿ“„ Chunk Size")
                    chunk_overlap_slider = gr.Slider(0, 100, value=50, step=10, label="๐Ÿ”— Overlap")
                    retrieval_k_slider = gr.Slider(3, 15, value=8, step=1, label="๐Ÿ” Retrieved Chunks")

                    gr.HTML("<h4>๐Ÿ› ๏ธ Agentic Tools</h4>")
                    with gr.Row():
                        enable_web = gr.Checkbox(value=True, label="๐ŸŒ Web Search")
                        enable_calc = gr.Checkbox(value=True, label="๐Ÿงฎ Calculator")
                    with gr.Row():
                        enable_fact = gr.Checkbox(value=True, label="โœ… Fact Check")
                        enable_analysis = gr.Checkbox(value=True, label="๐Ÿ“Š Analysis")

                    apply_btn = gr.Button("โšก Apply Settings", variant="primary", size="lg")

                settings_status = gr.Textbox(label="โš™๏ธ Settings Status", interactive=False, max_lines=8)

                with gr.Accordion("๐Ÿ”Š Voice Features Status", open=False):
                    gr.HTML(f"""
                    <div style="padding: 10px;">
                        <p><strong>Text-to-Speech (gTTS):</strong> {'โœ… Available' if GTTS_AVAILABLE else 'โŒ Not Available'}</p>
                        <p><strong>Speech-to-Text:</strong> {'โœ… Available' if STT_AVAILABLE else 'โŒ Not Available (HF Spaces limitation)'}</p>
                        <p><em>Voice output: Auto-plays with responses</em></p>
                    </div>
                    """)

        # Event Handlers
        def process_msg(message, history):
            # Bridge Gradio's sync callback to the agent's async coroutine.
            return asyncio.run(agent.process_agentic_query(message, history))

        submit_btn.click(process_msg, inputs=[msg, chatbot], outputs=[chatbot, msg, audio_output])
        msg.submit(process_msg, inputs=[msg, chatbot], outputs=[chatbot, msg, audio_output])
        clear_btn.click(lambda: [], outputs=[chatbot])

        # Uploading files immediately (re)builds the retriever index.
        file_upload.change(
            agent.upload_documents,
            inputs=[file_upload],
            outputs=[upload_status]
        )

        apply_btn.click(
            agent.update_settings,
            inputs=[
                temperature_slider, max_tokens_slider, chunk_size_slider,
                chunk_overlap_slider, retrieval_k_slider, enable_web,
                enable_calc, enable_fact, enable_analysis
            ],
            outputs=[settings_status]
        )

    return interface
853
+
854
+
855
+ # ===================================================================
856
+ # MAIN
857
+ # ===================================================================
858
+
859
if __name__ == "__main__":
    # Script entry point: print the feature banner, then start the UI.
    print("๐Ÿš€ Launching AI Research Agent on Hugging Face Spaces...")
    print("โœจ Features:")
    features = (
        "Multi-Tool Integration",
        "Intelligent Query Planning",
        "Multi-Step Reasoning",
        "Result Synthesis",
        "Quality Evaluation",
        "๐Ÿ”Š Voice Output (Text-to-Speech)",
    )
    for feature in features:
        print(f" โ€ข {feature}")

    app = create_interface()
    app.launch()