mlbench123 commited on
Commit
d3b9b16
·
verified ·
1 Parent(s): d2d3a42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +544 -67
app.py CHANGED
@@ -2,7 +2,6 @@
2
  FastAPI Service for Construction Scope Validation
3
  Deploy on Hugging Face Spaces
4
  """
5
-
6
  from fastapi import FastAPI, HTTPException
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel, Field
@@ -30,19 +29,14 @@ app.add_middleware(
30
  )
31
 
32
  # Load embedding model (cached globally)
33
- # Try to load trained model from root, fallback to base model
34
  print("="*60)
35
  print("LOADING MODEL...")
36
  print("="*60)
37
-
38
  try:
39
- # Check if trained model files exist in root
40
- # Check if trained model files exist in root
41
  model_files = ['config.json', 'sentence_bert_config.json']
42
- # Check for either pytorch_model.bin or model.safetensors
43
  has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
44
  has_model = all(os.path.exists(f) for f in model_files) and has_weights
45
-
46
  if has_model:
47
  print("✓ Trained model files found in root directory")
48
  print("Loading trained model...")
@@ -57,11 +51,9 @@ except Exception as e:
57
  print("Falling back to base model...")
58
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
59
  print("✅ Base model loaded successfully!")
60
-
61
  print("="*60)
62
 
63
  # ============= DATA MODELS =============
64
-
65
  class LLMScopeItem(BaseModel):
66
  stage: str
67
  task: str
@@ -115,8 +107,34 @@ class ValidatedResponse(BaseModel):
115
  areas: List[ValidatedArea]
116
  summary: Dict[str, Any]
117
 
118
- # ============= DATABASE LOADERS =============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
 
120
  class DatabaseLoader:
121
  def __init__(self):
122
  self.stages = []
@@ -126,7 +144,7 @@ class DatabaseLoader:
126
  self.stage_embeddings = None
127
  self.task_embeddings = None
128
  self.material_embeddings = None
129
-
130
  def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
131
  """Load JSON data files"""
132
  print(f"Loading {stages_file}...")
@@ -147,7 +165,7 @@ class DatabaseLoader:
147
 
148
  print(f"✅ Loaded: {len(self.stages)} stages, {len(self.tasks)} tasks, "
149
  f"{len(self.materials)} materials, {len(self.rooms)} rooms")
150
-
151
  def initialize_embeddings(self):
152
  """Pre-compute embeddings for fast lookup"""
153
  print("Computing stage embeddings...")
@@ -168,12 +186,10 @@ class DatabaseLoader:
168
  db = DatabaseLoader()
169
 
170
  # ============= MATCHING FUNCTIONS =============
171
-
172
  def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
173
  """Find closest matching stage from DB"""
174
  query_embedding = embedding_model.encode([llm_stage])
175
  similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
176
-
177
  best_idx = np.argmax(similarities)
178
  best_score = similarities[best_idx]
179
 
@@ -205,28 +221,21 @@ def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
205
 
206
def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
    """Find the top-k DB tasks for a stage that best match an LLM task description.

    Args:
        stage_id: stageId whose tasks are the candidate pool.
        llm_task: free-text task description produced by the LLM.
        top_k: maximum number of matches to return.

    Returns:
        List of (task_dict, cosine_similarity) pairs, best match first;
        empty list when the stage has no tasks.
    """
    # Collect indices in a single pass. The previous version filtered the
    # tasks first and then called db.tasks.index(t) per task, which is
    # O(n^2) and returns the index of the FIRST equal dict (wrong row, and
    # wrong embedding, whenever duplicate task dicts exist).
    task_indices = [i for i, t in enumerate(db.tasks) if t['stageId'] == stage_id]

    if not task_indices:
        return []

    stage_tasks = [db.tasks[i] for i in task_indices]

    query_embedding = embedding_model.encode([llm_task])
    stage_task_embeddings = db.task_embeddings[task_indices]
    similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]

    # Indices of the k largest similarities, highest first. argsort handles
    # the case where the stage has fewer than top_k tasks.
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [(stage_tasks[idx], similarities[idx]) for idx in top_indices]
226
 
227
def extract_keywords(text: str) -> List[str]:
    """Extract meaningful keywords from *text*.

    Lowercases the input, tokenizes on word boundaries, and drops common
    stop words as well as tokens of two characters or fewer.
    """
    stop_words = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
    tokens = re.findall(r'\b\w+\b', text.lower())
    keywords = []
    for token in tokens:
        if len(token) > 2 and token not in stop_words:
            keywords.append(token)
    return keywords
@@ -237,34 +246,27 @@ def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int
237
  llm_keywords = extract_keywords(llm_material)
238
  all_keywords = set(task_keywords + llm_keywords)
239
 
240
- # Filter by unit compatibility
241
  compatible_materials = [
242
  m for m in db.materials
243
  if m['unit'] == unit or m['unit'] == 'unit' or m['unit'] is None
244
  ]
245
-
246
  if not compatible_materials:
247
- # Fallback: allow any unit
248
  compatible_materials = db.materials
249
 
250
- # Score materials
251
  scored_materials = []
252
  for material in compatible_materials:
253
  score = 0.0
254
  material_text = material['material'].lower()
255
 
256
- # Keyword matching
257
  for keyword in all_keywords:
258
  if keyword in material_text:
259
  score += 2.0
260
 
261
- # Category matching
262
  categories_str = ' '.join(material.get('categories', [])).lower()
263
  for keyword in all_keywords:
264
  if keyword in categories_str:
265
  score += 1.0
266
 
267
- # Embedding similarity
268
  material_idx = db.materials.index(material)
269
  query_embedding = embedding_model.encode([llm_material])
270
  material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
@@ -274,32 +276,25 @@ def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int
274
  if score > 0:
275
  scored_materials.append((material, score))
276
 
277
- # Sort and return top K
278
  scored_materials.sort(key=lambda x: x[1], reverse=True)
279
  return scored_materials[:top_k]
280
 
281
  # ============= VALIDATION PIPELINE =============
282
-
283
  def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
284
  """Main validation pipeline"""
285
  validated_areas = []
286
 
287
  for area_scope in llm_scope.scope_of_work:
288
- # Match room/area
289
  matched_room, room_confidence = find_best_room(area_scope.area)
290
-
291
  validated_stages_dict = {}
292
 
293
  for item in area_scope.items:
294
- # Match stage
295
  matched_stage, stage_confidence = find_best_stage(item.stage)
296
-
297
  if not matched_stage:
298
- continue # Skip if stage not found
299
 
300
  stage_id = matched_stage['stageId']
301
 
302
- # Initialize stage if new
303
  if stage_id not in validated_stages_dict:
304
  validated_stages_dict[stage_id] = {
305
  'stage_data': matched_stage,
@@ -307,20 +302,14 @@ def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
307
  'tasks': []
308
  }
309
 
310
- # Match task
311
  task_matches = find_tasks_for_stage(stage_id, item.task, top_k=3)
312
-
313
  if not task_matches:
314
  continue
315
 
316
  best_task, task_confidence = task_matches[0]
317
 
318
- # Match materials
319
  material_matches = find_materials_for_task(
320
- best_task,
321
- item.material,
322
- item.unit,
323
- top_k=5
324
  )
325
 
326
  validated_materials = [
@@ -337,20 +326,20 @@ def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
337
  for m, score in material_matches
338
  ]
339
 
 
340
  validated_task = ValidatedTask(
341
  taskId=best_task['taskId'],
342
  task=best_task['task'],
343
  displayName=best_task['displayName'],
344
  unit=best_task['unit'],
345
  stageId=best_task['stageId'],
346
- roomArea=best_task['roomArea'],
347
  confidence_score=round(task_confidence, 2),
348
  recommended_materials=validated_materials
349
  )
350
 
351
  validated_stages_dict[stage_id]['tasks'].append(validated_task)
352
 
353
- # Build validated stages list
354
  validated_stages = [
355
  ValidatedStage(
356
  stageId=stage_data['stage_data']['stageId'],
@@ -362,7 +351,6 @@ def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
362
  for stage_data in validated_stages_dict.values()
363
  ]
364
 
365
- # Sort stages by priority
366
  validated_stages.sort(key=lambda x: x.priority)
367
 
368
  validated_area = ValidatedArea(
@@ -376,7 +364,6 @@ def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
376
 
377
  validated_areas.append(validated_area)
378
 
379
- # Build summary
380
  summary = {
381
  'total_areas': len(validated_areas),
382
  'total_stages': sum(len(a.stages) for a in validated_areas),
@@ -396,7 +383,6 @@ def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
396
  return ValidatedResponse(areas=validated_areas, summary=summary)
397
 
398
  # ============= API ENDPOINTS =============
399
-
400
  @app.get("/")
401
  async def root():
402
  return {
@@ -404,7 +390,7 @@ async def root():
404
  "version": "1.0.0",
405
  "status": "running",
406
  "data_loaded": len(db.stages) > 0,
407
- "model_type": "trained" if os.path.exists('pytorch_model.bin') else "base"
408
  }
409
 
410
  @app.get("/health")
@@ -416,29 +402,24 @@ async def health():
416
  "materials_loaded": len(db.materials),
417
  "rooms_loaded": len(db.rooms),
418
  "embeddings_ready": db.stage_embeddings is not None,
419
- "model_type": "trained" if os.path.exists('pytorch_model.bin') else "base"
420
  }
421
 
422
  @app.post("/validate", response_model=ValidatedResponse)
423
  async def validate_scope_endpoint(request: LLMScopeRequest):
424
  """
425
  Validate LLM-generated scope against database
426
-
427
- Returns enriched data with:
428
- - Matched stages from DB
429
- - Matched tasks from DB
430
- - Recommended materials with pricing
431
- - Confidence scores for all matches
432
  """
433
  try:
434
  if not db.stages:
435
  raise HTTPException(status_code=500, detail="Database not loaded")
436
-
437
  result = validate_scope(request)
438
  return result
439
-
440
  except Exception as e:
441
- raise HTTPException(status_code=500, detail=f"Validation error: {str(e)}")
 
 
442
 
443
  @app.post("/match-stage")
444
  async def match_stage(stage_name: str):
@@ -465,7 +446,6 @@ async def match_room(room_name: str):
465
  return {"input": room_name, "matched": None, "confidence": 0.0}
466
 
467
  # ============= STARTUP =============
468
-
469
  @app.on_event("startup")
470
  async def startup_event():
471
  """Load data and initialize embeddings on startup"""
@@ -474,12 +454,6 @@ async def startup_event():
474
  print("STARTING UP...")
475
  print("="*60)
476
 
477
- # Check what files are available
478
- print("\nFiles in root directory:")
479
- for file in os.listdir('.'):
480
- print(f" - {file}")
481
-
482
- # Load data
483
  db.load_data(
484
  stages_file='stages.json',
485
  tasks_file='tasks.json',
@@ -499,4 +473,507 @@ async def startup_event():
499
 
500
  if __name__ == "__main__":
501
  import uvicorn
502
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  FastAPI Service for Construction Scope Validation
3
  Deploy on Hugging Face Spaces
4
  """
 
5
  from fastapi import FastAPI, HTTPException
6
  from fastapi.middleware.cors import CORSMiddleware
7
  from pydantic import BaseModel, Field
 
29
  )
30
 
31
  # Load embedding model (cached globally)
 
32
  print("="*60)
33
  print("LOADING MODEL...")
34
  print("="*60)
 
35
  try:
 
 
36
  model_files = ['config.json', 'sentence_bert_config.json']
 
37
  has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
38
  has_model = all(os.path.exists(f) for f in model_files) and has_weights
39
+
40
  if has_model:
41
  print("✓ Trained model files found in root directory")
42
  print("Loading trained model...")
 
51
  print("Falling back to base model...")
52
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
53
  print("✅ Base model loaded successfully!")
 
54
  print("="*60)
55
 
56
  # ============= DATA MODELS =============
 
57
  class LLMScopeItem(BaseModel):
58
  stage: str
59
  task: str
 
107
  areas: List[ValidatedArea]
108
  summary: Dict[str, Any]
109
 
110
+ # ============= HELPER FUNCTION =============
111
def parse_room_area(room_area_value):
    """
    Normalize a roomArea field into a list of strings.

    The DB stores roomArea inconsistently: it may be None, an actual list,
    a JSON-encoded string (e.g. '["Kitchen"]'), or a bare scalar.
    """
    if room_area_value is None:
        return []

    # Already a list -- nothing to do.
    if isinstance(room_area_value, list):
        return room_area_value

    # Strings may be JSON-encoded; a parse failure means it is a plain name.
    if isinstance(room_area_value, str):
        try:
            decoded = json.loads(room_area_value)
        except json.JSONDecodeError:
            return [room_area_value]
        return decoded if isinstance(decoded, list) else [str(decoded)]

    # Any other scalar: stringify and wrap.
    return [str(room_area_value)]
136
 
137
+ # ============= DATABASE LOADERS =============
138
  class DatabaseLoader:
139
  def __init__(self):
140
  self.stages = []
 
144
  self.stage_embeddings = None
145
  self.task_embeddings = None
146
  self.material_embeddings = None
147
+
148
  def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
149
  """Load JSON data files"""
150
  print(f"Loading {stages_file}...")
 
165
 
166
  print(f"✅ Loaded: {len(self.stages)} stages, {len(self.tasks)} tasks, "
167
  f"{len(self.materials)} materials, {len(self.rooms)} rooms")
168
+
169
  def initialize_embeddings(self):
170
  """Pre-compute embeddings for fast lookup"""
171
  print("Computing stage embeddings...")
 
186
  db = DatabaseLoader()
187
 
188
  # ============= MATCHING FUNCTIONS =============
 
189
  def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
190
  """Find closest matching stage from DB"""
191
  query_embedding = embedding_model.encode([llm_stage])
192
  similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
 
193
  best_idx = np.argmax(similarities)
194
  best_score = similarities[best_idx]
195
 
 
221
 
222
  def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
223
  """Find relevant tasks for a stage matching LLM task description"""
 
224
  stage_tasks = [t for t in db.tasks if t['stageId'] == stage_id]
 
225
  if not stage_tasks:
226
  return []
227
 
 
228
  task_indices = [db.tasks.index(t) for t in stage_tasks]
229
  query_embedding = embedding_model.encode([llm_task])
 
230
  stage_task_embeddings = db.task_embeddings[task_indices]
231
  similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]
232
 
 
233
  top_indices = np.argsort(similarities)[-top_k:][::-1]
234
  results = [(stage_tasks[idx], similarities[idx]) for idx in top_indices]
 
235
  return results
236
 
237
  def extract_keywords(text: str) -> List[str]:
238
  """Extract meaningful keywords from text"""
 
239
  stop_words = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
240
  words = re.findall(r'\b\w+\b', text.lower())
241
  return [w for w in words if w not in stop_words and len(w) > 2]
 
246
  llm_keywords = extract_keywords(llm_material)
247
  all_keywords = set(task_keywords + llm_keywords)
248
 
 
249
  compatible_materials = [
250
  m for m in db.materials
251
  if m['unit'] == unit or m['unit'] == 'unit' or m['unit'] is None
252
  ]
 
253
  if not compatible_materials:
 
254
  compatible_materials = db.materials
255
 
 
256
  scored_materials = []
257
  for material in compatible_materials:
258
  score = 0.0
259
  material_text = material['material'].lower()
260
 
 
261
  for keyword in all_keywords:
262
  if keyword in material_text:
263
  score += 2.0
264
 
 
265
  categories_str = ' '.join(material.get('categories', [])).lower()
266
  for keyword in all_keywords:
267
  if keyword in categories_str:
268
  score += 1.0
269
 
 
270
  material_idx = db.materials.index(material)
271
  query_embedding = embedding_model.encode([llm_material])
272
  material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
 
276
  if score > 0:
277
  scored_materials.append((material, score))
278
 
 
279
  scored_materials.sort(key=lambda x: x[1], reverse=True)
280
  return scored_materials[:top_k]
281
 
282
  # ============= VALIDATION PIPELINE =============
 
283
  def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
284
  """Main validation pipeline"""
285
  validated_areas = []
286
 
287
  for area_scope in llm_scope.scope_of_work:
 
288
  matched_room, room_confidence = find_best_room(area_scope.area)
 
289
  validated_stages_dict = {}
290
 
291
  for item in area_scope.items:
 
292
  matched_stage, stage_confidence = find_best_stage(item.stage)
 
293
  if not matched_stage:
294
+ continue
295
 
296
  stage_id = matched_stage['stageId']
297
 
 
298
  if stage_id not in validated_stages_dict:
299
  validated_stages_dict[stage_id] = {
300
  'stage_data': matched_stage,
 
302
  'tasks': []
303
  }
304
 
 
305
  task_matches = find_tasks_for_stage(stage_id, item.task, top_k=3)
 
306
  if not task_matches:
307
  continue
308
 
309
  best_task, task_confidence = task_matches[0]
310
 
 
311
  material_matches = find_materials_for_task(
312
+ best_task, item.material, item.unit, top_k=5
 
 
 
313
  )
314
 
315
  validated_materials = [
 
326
  for m, score in material_matches
327
  ]
328
 
329
+ # FIX: Parse roomArea properly
330
  validated_task = ValidatedTask(
331
  taskId=best_task['taskId'],
332
  task=best_task['task'],
333
  displayName=best_task['displayName'],
334
  unit=best_task['unit'],
335
  stageId=best_task['stageId'],
336
+ roomArea=parse_room_area(best_task['roomArea']), # <-- FIXED HERE
337
  confidence_score=round(task_confidence, 2),
338
  recommended_materials=validated_materials
339
  )
340
 
341
  validated_stages_dict[stage_id]['tasks'].append(validated_task)
342
 
 
343
  validated_stages = [
344
  ValidatedStage(
345
  stageId=stage_data['stage_data']['stageId'],
 
351
  for stage_data in validated_stages_dict.values()
352
  ]
353
 
 
354
  validated_stages.sort(key=lambda x: x.priority)
355
 
356
  validated_area = ValidatedArea(
 
364
 
365
  validated_areas.append(validated_area)
366
 
 
367
  summary = {
368
  'total_areas': len(validated_areas),
369
  'total_stages': sum(len(a.stages) for a in validated_areas),
 
383
  return ValidatedResponse(areas=validated_areas, summary=summary)
384
 
385
  # ============= API ENDPOINTS =============
 
386
  @app.get("/")
387
  async def root():
388
  return {
 
390
  "version": "1.0.0",
391
  "status": "running",
392
  "data_loaded": len(db.stages) > 0,
393
+ "model_type": "trained" if os.path.exists('model.safetensors') else "base"
394
  }
395
 
396
  @app.get("/health")
 
402
  "materials_loaded": len(db.materials),
403
  "rooms_loaded": len(db.rooms),
404
  "embeddings_ready": db.stage_embeddings is not None,
405
+ "model_type": "trained" if os.path.exists('model.safetensors') else "base"
406
  }
407
 
408
  @app.post("/validate", response_model=ValidatedResponse)
409
  async def validate_scope_endpoint(request: LLMScopeRequest):
410
  """
411
  Validate LLM-generated scope against database
412
+ Returns enriched data with matched stages, tasks, materials, and confidence scores
 
 
 
 
 
413
  """
414
  try:
415
  if not db.stages:
416
  raise HTTPException(status_code=500, detail="Database not loaded")
 
417
  result = validate_scope(request)
418
  return result
 
419
  except Exception as e:
420
+ import traceback
421
+ error_detail = f"Validation error: {str(e)}\n{traceback.format_exc()}"
422
+ raise HTTPException(status_code=500, detail=error_detail)
423
 
424
  @app.post("/match-stage")
425
  async def match_stage(stage_name: str):
 
446
  return {"input": room_name, "matched": None, "confidence": 0.0}
447
 
448
  # ============= STARTUP =============
 
449
  @app.on_event("startup")
450
  async def startup_event():
451
  """Load data and initialize embeddings on startup"""
 
454
  print("STARTING UP...")
455
  print("="*60)
456
 
 
 
 
 
 
 
457
  db.load_data(
458
  stages_file='stages.json',
459
  tasks_file='tasks.json',
 
473
 
474
  if __name__ == "__main__":
475
  import uvicorn
476
+ uvicorn.run(app, host="0.0.0.0", port=7860)
477
+
478
+ # """
479
+ # FastAPI Service for Construction Scope Validation
480
+ # Deploy on Hugging Face Spaces
481
+ # """
482
+
483
+ # from fastapi import FastAPI, HTTPException
484
+ # from fastapi.middleware.cors import CORSMiddleware
485
+ # from pydantic import BaseModel, Field
486
+ # from typing import List, Optional, Dict, Any
487
+ # import json
488
+ # import numpy as np
489
+ # import os
490
+ # from sentence_transformers import SentenceTransformer
491
+ # from sklearn.metrics.pairwise import cosine_similarity
492
+ # import re
493
+
494
+ # app = FastAPI(
495
+ # title="Construction Scope Validator API",
496
+ # description="Validates and enriches LLM-generated construction scope with DB data",
497
+ # version="1.0.0"
498
+ # )
499
+
500
+ # # CORS middleware
501
+ # app.add_middleware(
502
+ # CORSMiddleware,
503
+ # allow_origins=["*"],
504
+ # allow_credentials=True,
505
+ # allow_methods=["*"],
506
+ # allow_headers=["*"],
507
+ # )
508
+
509
+ # # Load embedding model (cached globally)
510
+ # # Try to load trained model from root, fallback to base model
511
+ # print("="*60)
512
+ # print("LOADING MODEL...")
513
+ # print("="*60)
514
+
515
+ # try:
516
+ # # Check if trained model files exist in root
517
+ # # Check if trained model files exist in root
518
+ # model_files = ['config.json', 'sentence_bert_config.json']
519
+ # # Check for either pytorch_model.bin or model.safetensors
520
+ # has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
521
+ # has_model = all(os.path.exists(f) for f in model_files) and has_weights
522
+
523
+ # if has_model:
524
+ # print("✓ Trained model files found in root directory")
525
+ # print("Loading trained model...")
526
+ # embedding_model = SentenceTransformer('./', device='cpu')
527
+ # print("✅ Trained model loaded successfully!")
528
+ # else:
529
+ # print("⚠️ Trained model not found, using base model...")
530
+ # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
531
+ # print("✅ Base model loaded successfully!")
532
+ # except Exception as e:
533
+ # print(f"❌ Error loading trained model: {e}")
534
+ # print("Falling back to base model...")
535
+ # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
536
+ # print("✅ Base model loaded successfully!")
537
+
538
+ # print("="*60)
539
+
540
+ # # ============= DATA MODELS =============
541
+
542
+ # class LLMScopeItem(BaseModel):
543
+ # stage: str
544
+ # task: str
545
+ # material: str
546
+ # quantity: float
547
+ # unit: str
548
+
549
+ # class LLMAreaScope(BaseModel):
550
+ # area: str
551
+ # items: List[LLMScopeItem]
552
+
553
+ # class LLMScopeRequest(BaseModel):
554
+ # scope_of_work: List[LLMAreaScope]
555
+
556
+ # class ValidatedMaterial(BaseModel):
557
+ # materialId: int
558
+ # name: str
559
+ # material: str
560
+ # unit: str
561
+ # price: float
562
+ # margin: float
563
+ # categories: List[str]
564
+ # confidence_score: float
565
+
566
+ # class ValidatedTask(BaseModel):
567
+ # taskId: int
568
+ # task: str
569
+ # displayName: str
570
+ # unit: str
571
+ # stageId: int
572
+ # roomArea: List[str]
573
+ # confidence_score: float
574
+ # recommended_materials: List[ValidatedMaterial]
575
+
576
+ # class ValidatedStage(BaseModel):
577
+ # stageId: int
578
+ # stage: str
579
+ # priority: int
580
+ # confidence_score: float
581
+ # tasks: List[ValidatedTask]
582
+
583
+ # class ValidatedArea(BaseModel):
584
+ # roomId: Optional[int]
585
+ # name: str
586
+ # roomType: str
587
+ # matched: bool
588
+ # confidence_score: float
589
+ # stages: List[ValidatedStage]
590
+
591
+ # class ValidatedResponse(BaseModel):
592
+ # areas: List[ValidatedArea]
593
+ # summary: Dict[str, Any]
594
+
595
+ # # ============= DATABASE LOADERS =============
596
+
597
+ # class DatabaseLoader:
598
+ # def __init__(self):
599
+ # self.stages = []
600
+ # self.tasks = []
601
+ # self.materials = []
602
+ # self.rooms = []
603
+ # self.stage_embeddings = None
604
+ # self.task_embeddings = None
605
+ # self.material_embeddings = None
606
+
607
+ # def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
608
+ # """Load JSON data files"""
609
+ # print(f"Loading {stages_file}...")
610
+ # with open(stages_file, 'r', encoding='utf-8') as f:
611
+ # self.stages = [json.loads(line) for line in f if line.strip()]
612
+
613
+ # print(f"Loading {tasks_file}...")
614
+ # with open(tasks_file, 'r', encoding='utf-8') as f:
615
+ # self.tasks = [json.loads(line) for line in f if line.strip()]
616
+
617
+ # print(f"Loading {materials_file}...")
618
+ # with open(materials_file, 'r', encoding='utf-8') as f:
619
+ # self.materials = [json.loads(line) for line in f if line.strip()]
620
+
621
+ # print(f"Loading {rooms_file}...")
622
+ # with open(rooms_file, 'r', encoding='utf-8') as f:
623
+ # self.rooms = [json.loads(line) for line in f if line.strip()]
624
+
625
+ # print(f"✅ Loaded: {len(self.stages)} stages, {len(self.tasks)} tasks, "
626
+ # f"{len(self.materials)} materials, {len(self.rooms)} rooms")
627
+
628
+ # def initialize_embeddings(self):
629
+ # """Pre-compute embeddings for fast lookup"""
630
+ # print("Computing stage embeddings...")
631
+ # stage_texts = [s['stage'] for s in self.stages]
632
+ # self.stage_embeddings = embedding_model.encode(stage_texts, show_progress_bar=True)
633
+
634
+ # print("Computing task embeddings...")
635
+ # task_texts = [t['task'] for t in self.tasks]
636
+ # self.task_embeddings = embedding_model.encode(task_texts, show_progress_bar=True)
637
+
638
+ # print("Computing material embeddings...")
639
+ # material_texts = [m['material'] for m in self.materials]
640
+ # self.material_embeddings = embedding_model.encode(material_texts, show_progress_bar=True)
641
+
642
+ # print("✅ Embeddings ready!")
643
+
644
+ # # Global DB instance
645
+ # db = DatabaseLoader()
646
+
647
+ # # ============= MATCHING FUNCTIONS =============
648
+
649
+ # def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
650
+ # """Find closest matching stage from DB"""
651
+ # query_embedding = embedding_model.encode([llm_stage])
652
+ # similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
653
+
654
+ # best_idx = np.argmax(similarities)
655
+ # best_score = similarities[best_idx]
656
+
657
+ # if best_score >= threshold:
658
+ # return db.stages[best_idx], best_score
659
+ # return None, 0.0
660
+
661
+ # def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
662
+ # """Find closest matching room from DB"""
663
+ # llm_area_lower = llm_area.lower()
664
+
665
+ # # Exact match first
666
+ # for room in db.rooms:
667
+ # if room['name'].lower() == llm_area_lower:
668
+ # return room, 1.0
669
+
670
+ # # Fuzzy match
671
+ # room_texts = [r['name'] for r in db.rooms]
672
+ # query_embedding = embedding_model.encode([llm_area])
673
+ # room_embeddings = embedding_model.encode(room_texts)
674
+ # similarities = cosine_similarity(query_embedding, room_embeddings)[0]
675
+
676
+ # best_idx = np.argmax(similarities)
677
+ # best_score = similarities[best_idx]
678
+
679
+ # if best_score >= threshold:
680
+ # return db.rooms[best_idx], best_score
681
+ # return None, 0.0
682
+
683
+ # def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
684
+ # """Find relevant tasks for a stage matching LLM task description"""
685
+ # # Filter tasks by stage
686
+ # stage_tasks = [t for t in db.tasks if t['stageId'] == stage_id]
687
+
688
+ # if not stage_tasks:
689
+ # return []
690
+
691
+ # # Compute similarities
692
+ # task_indices = [db.tasks.index(t) for t in stage_tasks]
693
+ # query_embedding = embedding_model.encode([llm_task])
694
+
695
+ # stage_task_embeddings = db.task_embeddings[task_indices]
696
+ # similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]
697
+
698
+ # # Get top K
699
+ # top_indices = np.argsort(similarities)[-top_k:][::-1]
700
+ # results = [(stage_tasks[idx], similarities[idx]) for idx in top_indices]
701
+
702
+ # return results
703
+
704
+ # def extract_keywords(text: str) -> List[str]:
705
+ # """Extract meaningful keywords from text"""
706
+ # # Remove common words
707
+ # stop_words = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
708
+ # words = re.findall(r'\b\w+\b', text.lower())
709
+ # return [w for w in words if w not in stop_words and len(w) > 2]
710
+
711
+ # def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int = 10) -> List[tuple]:
712
+ # """Find materials matching task requirements"""
713
+ # task_keywords = extract_keywords(task['task'])
714
+ # llm_keywords = extract_keywords(llm_material)
715
+ # all_keywords = set(task_keywords + llm_keywords)
716
+
717
+ # # Filter by unit compatibility
718
+ # compatible_materials = [
719
+ # m for m in db.materials
720
+ # if m['unit'] == unit or m['unit'] == 'unit' or m['unit'] is None
721
+ # ]
722
+
723
+ # if not compatible_materials:
724
+ # # Fallback: allow any unit
725
+ # compatible_materials = db.materials
726
+
727
+ # # Score materials
728
+ # scored_materials = []
729
+ # for material in compatible_materials:
730
+ # score = 0.0
731
+ # material_text = material['material'].lower()
732
+
733
+ # # Keyword matching
734
+ # for keyword in all_keywords:
735
+ # if keyword in material_text:
736
+ # score += 2.0
737
+
738
+ # # Category matching
739
+ # categories_str = ' '.join(material.get('categories', [])).lower()
740
+ # for keyword in all_keywords:
741
+ # if keyword in categories_str:
742
+ # score += 1.0
743
+
744
+ # # Embedding similarity
745
+ # material_idx = db.materials.index(material)
746
+ # query_embedding = embedding_model.encode([llm_material])
747
+ # material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
748
+ # semantic_score = cosine_similarity(query_embedding, material_embedding)[0][0]
749
+ # score += semantic_score * 5.0
750
+
751
+ # if score > 0:
752
+ # scored_materials.append((material, score))
753
+
754
+ # # Sort and return top K
755
+ # scored_materials.sort(key=lambda x: x[1], reverse=True)
756
+ # return scored_materials[:top_k]
757
+
758
+ # # ============= VALIDATION PIPELINE =============
759
+
760
+ # def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
761
+ # """Main validation pipeline"""
762
+ # validated_areas = []
763
+
764
+ # for area_scope in llm_scope.scope_of_work:
765
+ # # Match room/area
766
+ # matched_room, room_confidence = find_best_room(area_scope.area)
767
+
768
+ # validated_stages_dict = {}
769
+
770
+ # for item in area_scope.items:
771
+ # # Match stage
772
+ # matched_stage, stage_confidence = find_best_stage(item.stage)
773
+
774
+ # if not matched_stage:
775
+ # continue # Skip if stage not found
776
+
777
+ # stage_id = matched_stage['stageId']
778
+
779
+ # # Initialize stage if new
780
+ # if stage_id not in validated_stages_dict:
781
+ # validated_stages_dict[stage_id] = {
782
+ # 'stage_data': matched_stage,
783
+ # 'confidence': stage_confidence,
784
+ # 'tasks': []
785
+ # }
786
+
787
+ # # Match task
788
+ # task_matches = find_tasks_for_stage(stage_id, item.task, top_k=3)
789
+
790
+ # if not task_matches:
791
+ # continue
792
+
793
+ # best_task, task_confidence = task_matches[0]
794
+
795
+ # # Match materials
796
+ # material_matches = find_materials_for_task(
797
+ # best_task,
798
+ # item.material,
799
+ # item.unit,
800
+ # top_k=5
801
+ # )
802
+
803
+ # validated_materials = [
804
+ # ValidatedMaterial(
805
+ # materialId=m['materialId'],
806
+ # name=m['name'],
807
+ # material=m['material'],
808
+ # unit=m['unit'] or 'unit',
809
+ # price=float(m['price']),
810
+ # margin=float(m['margin']),
811
+ # categories=m['categories'],
812
+ # confidence_score=round(score / 10.0, 2)
813
+ # )
814
+ # for m, score in material_matches
815
+ # ]
816
+
817
+ # validated_task = ValidatedTask(
818
+ # taskId=best_task['taskId'],
819
+ # task=best_task['task'],
820
+ # displayName=best_task['displayName'],
821
+ # unit=best_task['unit'],
822
+ # stageId=best_task['stageId'],
823
+ # roomArea=best_task['roomArea'],
824
+ # confidence_score=round(task_confidence, 2),
825
+ # recommended_materials=validated_materials
826
+ # )
827
+
828
+ # validated_stages_dict[stage_id]['tasks'].append(validated_task)
829
+
830
+ # # Build validated stages list
831
+ # validated_stages = [
832
+ # ValidatedStage(
833
+ # stageId=stage_data['stage_data']['stageId'],
834
+ # stage=stage_data['stage_data']['stage'],
835
+ # priority=stage_data['stage_data']['priority'],
836
+ # confidence_score=round(stage_data['confidence'], 2),
837
+ # tasks=stage_data['tasks']
838
+ # )
839
+ # for stage_data in validated_stages_dict.values()
840
+ # ]
841
+
842
+ # # Sort stages by priority
843
+ # validated_stages.sort(key=lambda x: x.priority)
844
+
845
+ # validated_area = ValidatedArea(
846
+ # roomId=matched_room['id'] if matched_room else None,
847
+ # name=matched_room['name'] if matched_room else area_scope.area,
848
+ # roomType=matched_room['roomType'] if matched_room else 'unknown',
849
+ # matched=matched_room is not None,
850
+ # confidence_score=round(room_confidence, 2),
851
+ # stages=validated_stages
852
+ # )
853
+
854
+ # validated_areas.append(validated_area)
855
+
856
+ # # Build summary
857
+ # summary = {
858
+ # 'total_areas': len(validated_areas),
859
+ # 'total_stages': sum(len(a.stages) for a in validated_areas),
860
+ # 'total_tasks': sum(len(s.tasks) for a in validated_areas for s in a.stages),
861
+ # 'total_materials': sum(
862
+ # len(t.recommended_materials)
863
+ # for a in validated_areas
864
+ # for s in a.stages
865
+ # for t in s.tasks
866
+ # ),
867
+ # 'matched_areas': sum(1 for a in validated_areas if a.matched),
868
+ # 'avg_confidence': round(
869
+ # np.mean([a.confidence_score for a in validated_areas]), 2
870
+ # ) if validated_areas else 0.0
871
+ # }
872
+
873
+ # return ValidatedResponse(areas=validated_areas, summary=summary)
874
+
875
+ # # ============= API ENDPOINTS =============
876
+
877
+ # @app.get("/")
878
+ # async def root():
879
+ # return {
880
+ # "service": "Construction Scope Validator",
881
+ # "version": "1.0.0",
882
+ # "status": "running",
883
+ # "data_loaded": len(db.stages) > 0,
884
+ # "model_type": "trained" if os.path.exists('pytorch_model.bin') else "base"
885
+ # }
886
+
887
+ # @app.get("/health")
888
+ # async def health():
889
+ # return {
890
+ # "status": "healthy",
891
+ # "stages_loaded": len(db.stages),
892
+ # "tasks_loaded": len(db.tasks),
893
+ # "materials_loaded": len(db.materials),
894
+ # "rooms_loaded": len(db.rooms),
895
+ # "embeddings_ready": db.stage_embeddings is not None,
896
+ # "model_type": "trained" if os.path.exists('pytorch_model.bin') else "base"
897
+ # }
898
+
899
+ # @app.post("/validate", response_model=ValidatedResponse)
900
+ # async def validate_scope_endpoint(request: LLMScopeRequest):
901
+ # """
902
+ # Validate LLM-generated scope against database
903
+
904
+ # Returns enriched data with:
905
+ # - Matched stages from DB
906
+ # - Matched tasks from DB
907
+ # - Recommended materials with pricing
908
+ # - Confidence scores for all matches
909
+ # """
910
+ # try:
911
+ # if not db.stages:
912
+ # raise HTTPException(status_code=500, detail="Database not loaded")
913
+
914
+ # result = validate_scope(request)
915
+ # return result
916
+
917
+ # except Exception as e:
918
+ # raise HTTPException(status_code=500, detail=f"Validation error: {str(e)}")
919
+
920
+ # @app.post("/match-stage")
921
+ # async def match_stage(stage_name: str):
922
+ # """Test endpoint: match a single stage name"""
923
+ # matched_stage, confidence = find_best_stage(stage_name)
924
+ # if matched_stage:
925
+ # return {
926
+ # "input": stage_name,
927
+ # "matched": matched_stage,
928
+ # "confidence": round(confidence, 2)
929
+ # }
930
+ # return {"input": stage_name, "matched": None, "confidence": 0.0}
931
+
932
+ # @app.post("/match-room")
933
+ # async def match_room(room_name: str):
934
+ # """Test endpoint: match a single room name"""
935
+ # matched_room, confidence = find_best_room(room_name)
936
+ # if matched_room:
937
+ # return {
938
+ # "input": room_name,
939
+ # "matched": matched_room,
940
+ # "confidence": round(confidence, 2)
941
+ # }
942
+ # return {"input": room_name, "matched": None, "confidence": 0.0}
943
+
944
+ # # ============= STARTUP =============
945
+
946
+ # @app.on_event("startup")
947
+ # async def startup_event():
948
+ # """Load data and initialize embeddings on startup"""
949
+ # try:
950
+ # print("\n" + "="*60)
951
+ # print("STARTING UP...")
952
+ # print("="*60)
953
+
954
+ # # Check what files are available
955
+ # print("\nFiles in root directory:")
956
+ # for file in os.listdir('.'):
957
+ # print(f" - {file}")
958
+
959
+ # # Load data
960
+ # db.load_data(
961
+ # stages_file='stages.json',
962
+ # tasks_file='tasks.json',
963
+ # materials_file='materials.json',
964
+ # rooms_file='rooms.json'
965
+ # )
966
+ # db.initialize_embeddings()
967
+
968
+ # print("\n" + "="*60)
969
+ # print("✅ SERVICE READY!")
970
+ # print("="*60)
971
+ # except Exception as e:
972
+ # print(f"\n❌ STARTUP ERROR: {e}")
973
+ # print("Make sure JSON files are in the correct location")
974
+ # import traceback
975
+ # traceback.print_exc()
976
+
977
+ # if __name__ == "__main__":
978
+ # import uvicorn
979
+ # uvicorn.run(app, host="0.0.0.0", port=7860)