Nihal2000 committed · verified
Commit 5262791 · 1 Parent(s): 672ea87

Update app.py

Files changed (1)
  1. app.py +107 -15
app.py CHANGED
@@ -3,6 +3,9 @@ import torch
 import sys
 import os
 import re
+import json
+import time
+from datetime import datetime
 from pathlib import Path
 
 # Add the project root to Python path
@@ -11,12 +14,58 @@ sys.path.append(str(project_root))
 
 from src.inference.inference import tokenizer, model  # Import from your inference.py
 from src.vector_db.manager import ChromaVectorDBManager
+from src.utils.performance import PerformanceMonitor
 import logging
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Performance history file
+PERFORMANCE_HISTORY_FILE = Path("performance_history.json")
+
+def save_performance_metrics(metrics_data):
+    """Save performance metrics to history file"""
+    try:
+        if PERFORMANCE_HISTORY_FILE.exists():
+            with open(PERFORMANCE_HISTORY_FILE, 'r') as f:
+                history = json.load(f)
+        else:
+            history = []
+
+        history.append(metrics_data)
+
+        with open(PERFORMANCE_HISTORY_FILE, 'w') as f:
+            json.dump(history, f, indent=2)
+
+    except Exception as e:
+        logger.error(f"Failed to save performance metrics: {e}")
+
+def calculate_performance_metrics(start_time, end_time, prompt_tokens, generated_tokens, peak_memory_mb):
+    """Calculate performance metrics similar to the requested format"""
+    inference_time = end_time - start_time
+    total_tokens = prompt_tokens + generated_tokens
+
+    # Calculate throughput (tokens per second)
+    throughput = total_tokens / inference_time if inference_time > 0 else 0
+
+    # Calculate inference latency (time per token in milliseconds)
+    latency_ms = (inference_time * 1000) / total_tokens if total_tokens > 0 else 0
+
+    return {
+        "timestamp": datetime.now().isoformat(),
+        "model": "Gemma-3-270M",
+        "load_time_s": "N/A",  # Model is already loaded
+        "inference_latency_ms": round(latency_ms, 2),
+        "throughput_tokens_s": round(throughput, 2),
+        "ram_usage_mb": round(peak_memory_mb, 2),
+        "vram_usage_mb": 0,  # CPU-only model
+        "energy_j": "N/A",  # Would require specialized monitoring
+        "prompt_tokens": prompt_tokens,
+        "generated_tokens": generated_tokens,
+        "total_inference_time_s": round(inference_time, 3)
+    }
+
 # Initialize Vector DB Manager
 try:
     logger.info("Initializing ChromaDB manager")
@@ -92,21 +141,30 @@ QUESTION:
 
 ANSWER:"""
 
-        # Use inference setup
+        # Count prompt tokens
+        prompt_tokens = len(tokenizer.encode(prompt))
+
+        # Use inference setup with performance monitoring
         inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
 
-        # Generate response with conservative parameters for gemma-3-270m
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=256,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9,
-                repetition_penalty=1.1,
-                pad_token_id=tokenizer.eos_token_id if tokenizer.pad_token_id is None else tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id
-            )
+        # Start performance monitoring for inference
+        with PerformanceMonitor("Model_Inference") as monitor:
+            start_time = time.time()
+
+            # Generate response with conservative parameters for gemma-3-270m
+            with torch.no_grad():
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=256,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    repetition_penalty=1.1,
+                    pad_token_id=tokenizer.eos_token_id if tokenizer.pad_token_id is None else tokenizer.pad_token_id,
+                    eos_token_id=tokenizer.eos_token_id
+                )
+
+            end_time = time.time()
 
         # Decode and clean response
         full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -114,16 +172,50 @@ ANSWER:"""
         # Extract only the generated part (remove the original prompt)
         answer = full_response[len(prompt):].strip()
 
+        # Count generated tokens
+        generated_tokens = len(tokenizer.encode(answer))
+
+        # Get performance metrics from monitor
+        perf_metrics = monitor.stop_monitoring()
+
+        # Calculate and save performance metrics
+        metrics_data = calculate_performance_metrics(
+            start_time,
+            end_time,
+            prompt_tokens,
+            generated_tokens,
+            perf_metrics.peak_memory
+        )
+
+        # Save to history
+        save_performance_metrics(metrics_data)
+
+        # Log performance summary
+        logger.info(f"Performance Metrics:")
+        logger.info(f"  Model: {metrics_data['model']}")
+        logger.info(f"  Inference Latency: {metrics_data['inference_latency_ms']} ms")
+        logger.info(f"  Throughput: {metrics_data['throughput_tokens_s']} tokens/s")
+        logger.info(f"  RAM Usage: {metrics_data['ram_usage_mb']} MB")
+        logger.info(f"  Tokens (prompt/generated): {metrics_data['prompt_tokens']}/{metrics_data['generated_tokens']}")
+
         if not answer:
             answer = "I apologize, but I couldn't generate a proper response. Please try rephrasing your question."
 
         logger.info(f"Generated response length: {len(answer)} characters")
 
+        # Add performance info to sources
+        perf_info = f"\n\n**Performance Metrics:**\n" \
+                    f"- Model: {metrics_data['model']}\n" \
+                    f"- Inference Latency: {metrics_data['inference_latency_ms']} ms\n" \
+                    f"- Throughput: {metrics_data['throughput_tokens_s']} tokens/s\n" \
+                    f"- RAM Usage: {metrics_data['ram_usage_mb']} MB\n" \
+                    f"- Total Inference Time: {metrics_data['total_inference_time_s']} s"
+
         # Return answer and sources if requested
         if show_context:
-            return answer, f"**Sources Used:**\n{sources}\n\n**Context:**\n{context}"
+            return answer, f"**Sources Used:**\n{sources}\n\n**Context:**\n{context}{perf_info}"
         else:
-            return answer, f"**Sources Used:**\n{sources}"
+            return answer, f"**Sources Used:**\n{sources}{perf_info}"
 
     except Exception as e:
         logger.error(f"Error in chat_with_rag: {e}")
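Note: the new code imports PerformanceMonitor from src/utils/performance, which is not part of this commit. For readers following the diff, a minimal sketch of a context manager that would satisfy the calls made above (entering/exiting the monitored block, monitor.stop_monitoring(), and a peak_memory attribute in MB) is shown below; this is an assumption about the interface, not the repository's actual implementation, and it uses psutil for RAM sampling, which the real module may not.

# Hypothetical sketch of src/utils/performance.py (interface inferred from the diff above).
import time
import logging
from dataclasses import dataclass

import psutil  # assumed dependency for process memory sampling

logger = logging.getLogger(__name__)


@dataclass
class PerformanceMetrics:
    name: str
    elapsed_s: float
    peak_memory: float  # resident set size in MB, as consumed by calculate_performance_metrics


class PerformanceMonitor:
    """Context manager that tracks wall-clock time and process memory for a named block."""

    def __init__(self, name: str):
        self.name = name
        self._process = psutil.Process()
        self._start = None
        self._peak_mb = 0.0

    def __enter__(self):
        self._start = time.time()
        self._peak_mb = self._process.memory_info().rss / (1024 * 1024)
        return self

    def __exit__(self, exc_type, exc, tb):
        # Sample memory once more when the monitored block exits.
        self._peak_mb = max(self._peak_mb, self._process.memory_info().rss / (1024 * 1024))
        return False  # never suppress exceptions from the monitored block

    def stop_monitoring(self) -> PerformanceMetrics:
        # Safe to call after the with-block, as in the diff: state persists on the instance.
        elapsed = time.time() - self._start
        metrics = PerformanceMetrics(name=self.name, elapsed_s=elapsed, peak_memory=self._peak_mb)
        logger.info(f"{self.name}: {elapsed:.3f}s, peak RSS {self._peak_mb:.1f} MB")
        return metrics

Under this assumed interface, perf_metrics.peak_memory is already in MB, matching how calculate_performance_metrics() rounds it into ram_usage_mb.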