mlbench123 commited on
Commit
ce5f14e
·
verified ·
1 Parent(s): 5915e23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1313 -727
app.py CHANGED
@@ -1,28 +1,30 @@
1
-
2
  """
3
- FastAPI Service for Construction Scope Validation
4
- Deploy on Hugging Face Spaces - Flattened File Structure
5
  """
6
  from fastapi import FastAPI, HTTPException
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel, Field
9
- from typing import List, Optional, Dict, Any
10
  import json
11
  import numpy as np
12
  import os
13
- import shutil
14
  from sentence_transformers import SentenceTransformer
15
  from sklearn.metrics.pairwise import cosine_similarity
16
  import re
17
 
 
 
 
 
 
18
  app = FastAPI(
19
- title="Construction Scope Validator API",
20
- description="Validates and enriches LLM-generated construction scope with DB data",
21
- version="1.0.0"
22
  )
23
- #---------------------------
24
 
25
- # CORS middleware
26
  app.add_middleware(
27
  CORSMiddleware,
28
  allow_origins=["*"],
@@ -31,25 +33,17 @@ app.add_middleware(
31
  allow_headers=["*"],
32
  )
33
 
34
- # ============= MODEL LOADING WITH FLAT STRUCTURE =============
35
  print("="*60)
36
  print("LOADING MODEL...")
37
  print("="*60)
38
 
39
  def setup_model_structure():
40
- """
41
- Create temporary folder structure for sentence-transformers
42
- if files are in root (flattened structure)
43
- """
44
- # Check if we need to create structure
45
  if not os.path.exists('1_Pooling') or not os.path.exists('2_Normalize'):
46
  print("Creating temporary model structure...")
47
-
48
- # Create directories
49
  os.makedirs('1_Pooling', exist_ok=True)
50
  os.makedirs('2_Normalize', exist_ok=True)
51
 
52
- # Pooling config
53
  pooling_config = {
54
  "word_embedding_dimension": 384,
55
  "pooling_mode_cls_token": False,
@@ -60,100 +54,84 @@ def setup_model_structure():
60
  with open('1_Pooling/config.json', 'w') as f:
61
  json.dump(pooling_config, f, indent=2)
62
 
63
- # Normalize config (empty is fine)
64
  with open('2_Normalize/config.json', 'w') as f:
65
  json.dump({}, f)
66
 
67
- print("✓ Created 1_Pooling/config.json")
68
- print("✓ Created 2_Normalize/config.json")
69
 
70
- # Setup structure before loading model
71
  setup_model_structure()
72
 
 
 
 
 
73
  try:
74
  model_files = ['config.json', 'sentence_bert_config.json']
75
  has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
76
  has_model = all(os.path.exists(f) for f in model_files) and has_weights
77
 
78
  if has_model:
79
- print("✓ Model files found in root directory")
80
- print("Loading trained model...")
81
- embedding_model = SentenceTransformer('./', device='cpu')
82
- print("✅ Trained model loaded successfully!")
83
  else:
84
- print("⚠️ Model not found, using base model...")
85
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
86
- print("✅ Base model loaded successfully!")
87
  except Exception as e:
88
- print(f"❌ Error loading trained model: {e}")
89
- print("Falling back to base model...")
90
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
91
- print("✅ Base model loaded successfully!")
 
92
  print("="*60)
93
 
94
  # ============= DATA MODELS =============
95
- class LLMScopeItem(BaseModel):
96
  stage: str
97
  task: str
98
  material: str
99
  quantity: float
100
  unit: str
101
-
102
- class LLMAreaScope(BaseModel):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  area: str
104
- items: List[LLMScopeItem]
105
-
106
- class LLMScopeRequest(BaseModel):
107
- scope_of_work: List[LLMAreaScope]
 
 
108
 
109
- class ValidatedMaterial(BaseModel):
110
- materialId: int
111
- name: str
112
- material: str
113
- unit: str
114
- price: float
115
- margin: float
116
- categories: List[str]
117
- confidence_score: float
118
 
119
- class ValidatedTask(BaseModel):
120
- taskId: int
121
- task: str
122
- displayName: str
123
- unit: str
124
- stageId: int
125
- roomArea: List[str]
126
- confidence_score: float
127
- recommended_materials: List[ValidatedMaterial]
128
 
129
- class ValidatedStage(BaseModel):
130
- stageId: int
131
- stage: str
132
- priority: int
133
- confidence_score: float
134
- tasks: List[ValidatedTask]
135
-
136
- class ValidatedArea(BaseModel):
137
- roomId: Optional[int]
138
- name: str
139
- roomType: str
140
- matched: bool
141
- confidence_score: float
142
- stages: List[ValidatedStage]
143
-
144
- class ValidatedResponse(BaseModel):
145
- areas: List[ValidatedArea]
146
- summary: Dict[str, Any]
147
-
148
- # ============= HELPER FUNCTION =============
149
  def parse_room_area(room_area_value):
150
- """Parse roomArea field which might be a string, list, or None"""
151
  if room_area_value is None:
152
  return []
153
-
154
  if isinstance(room_area_value, list):
155
  return room_area_value
156
-
157
  if isinstance(room_area_value, str):
158
  try:
159
  parsed = json.loads(room_area_value)
@@ -162,10 +140,9 @@ def parse_room_area(room_area_value):
162
  return [str(parsed)]
163
  except json.JSONDecodeError:
164
  return [room_area_value]
165
-
166
  return [str(room_area_value)]
167
 
168
- # ============= DATABASE LOADERS =============
169
  class DatabaseLoader:
170
  def __init__(self):
171
  self.stages = []
@@ -177,7 +154,6 @@ class DatabaseLoader:
177
  self.material_embeddings = None
178
 
179
  def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
180
- """Load JSON data files"""
181
  print(f"Loading {stages_file}...")
182
  with open(stages_file, 'r', encoding='utf-8') as f:
183
  self.stages = [json.loads(line) for line in f if line.strip()]
@@ -198,28 +174,91 @@ class DatabaseLoader:
198
  f"{len(self.materials)} materials, {len(self.rooms)} rooms")
199
 
200
  def initialize_embeddings(self):
201
- """Pre-compute embeddings for fast lookup"""
202
- print("Computing stage embeddings...")
 
 
 
203
  stage_texts = [s['stage'] for s in self.stages]
204
- self.stage_embeddings = embedding_model.encode(stage_texts, show_progress_bar=True)
 
 
 
 
 
 
205
 
206
- print("Computing task embeddings...")
207
  task_texts = [t['task'] for t in self.tasks]
208
- self.task_embeddings = embedding_model.encode(task_texts, show_progress_bar=True)
 
 
 
 
 
 
209
 
210
- print("Computing material embeddings...")
211
  material_texts = [m['material'] for m in self.materials]
212
- self.material_embeddings = embedding_model.encode(material_texts, show_progress_bar=True)
 
 
 
 
 
 
213
 
 
214
  print("✅ Embeddings ready!")
 
215
 
216
- # Global DB instance
217
  db = DatabaseLoader()
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  # ============= MATCHING FUNCTIONS =============
220
  def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
221
- """Find closest matching stage from DB"""
222
- query_embedding = embedding_model.encode([llm_stage])
 
 
 
 
223
  similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
224
  best_idx = np.argmax(similarities)
225
  best_score = similarities[best_idx]
@@ -229,7 +268,6 @@ def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
229
  return None, 0.0
230
 
231
  def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
232
- """Find closest matching room from DB"""
233
  llm_area_lower = llm_area.lower()
234
 
235
  for room in db.rooms:
@@ -237,8 +275,18 @@ def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
237
  return room, 1.0
238
 
239
  room_texts = [r['name'] for r in db.rooms]
240
- query_embedding = embedding_model.encode([llm_area])
241
- room_embeddings = embedding_model.encode(room_texts)
 
 
 
 
 
 
 
 
 
 
242
  similarities = cosine_similarity(query_embedding, room_embeddings)[0]
243
 
244
  best_idx = np.argmax(similarities)
@@ -248,29 +296,80 @@ def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
248
  return db.rooms[best_idx], best_score
249
  return None, 0.0
250
 
251
- def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
252
- """Find relevant tasks for a stage matching LLM task description"""
 
 
 
 
 
 
 
 
 
253
  stage_tasks = [t for t in db.tasks if t['stageId'] == stage_id]
254
- if not stage_tasks:
255
- return []
256
 
257
- task_indices = [db.tasks.index(t) for t in stage_tasks]
258
- query_embedding = embedding_model.encode([llm_task])
259
- stage_task_embeddings = db.task_embeddings[task_indices]
260
- similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
- top_indices = np.argsort(similarities)[-top_k:][::-1]
263
- results = [(stage_tasks[idx], similarities[idx]) for idx in top_indices]
264
- return results
265
 
266
  def extract_keywords(text: str) -> List[str]:
267
- """Extract meaningful keywords from text"""
268
  stop_words = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
269
  words = re.findall(r'\b\w+\b', text.lower())
270
  return [w for w in words if w not in stop_words and len(w) > 2]
271
 
272
- def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int = 10) -> List[tuple]:
273
- """Find materials matching task requirements"""
274
  task_keywords = extract_keywords(task['task'])
275
  llm_keywords = extract_keywords(llm_material)
276
  all_keywords = set(task_keywords + llm_keywords)
@@ -282,6 +381,13 @@ def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int
282
  if not compatible_materials:
283
  compatible_materials = db.materials
284
 
 
 
 
 
 
 
 
285
  scored_materials = []
286
  for material in compatible_materials:
287
  score = 0.0
@@ -297,7 +403,6 @@ def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int
297
  score += 1.0
298
 
299
  material_idx = db.materials.index(material)
300
- query_embedding = embedding_model.encode([llm_material])
301
  material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
302
  semantic_score = cosine_similarity(query_embedding, material_embedding)[0][0]
303
  score += semantic_score * 5.0
@@ -305,120 +410,112 @@ def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int
305
  if score > 0:
306
  scored_materials.append((material, score))
307
 
 
 
 
308
  scored_materials.sort(key=lambda x: x[1], reverse=True)
309
- return scored_materials[:top_k]
310
 
311
  # ============= VALIDATION PIPELINE =============
312
- def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
313
- """Main validation pipeline"""
314
- validated_areas = []
 
 
315
 
316
- for area_scope in llm_scope.scope_of_work:
317
  matched_room, room_confidence = find_best_room(area_scope.area)
318
- validated_stages_dict = {}
319
 
 
320
  for item in area_scope.items:
321
- matched_stage, stage_confidence = find_best_stage(item.stage)
322
- if not matched_stage:
323
- continue
324
-
325
- stage_id = matched_stage['stageId']
326
-
327
- if stage_id not in validated_stages_dict:
328
- validated_stages_dict[stage_id] = {
329
- 'stage_data': matched_stage,
330
- 'confidence': stage_confidence,
331
- 'tasks': []
332
- }
333
-
334
- task_matches = find_tasks_for_stage(stage_id, item.task, top_k=3)
335
- if not task_matches:
336
- continue
337
-
338
- best_task, task_confidence = task_matches[0]
339
-
340
- material_matches = find_materials_for_task(
341
- best_task, item.material, item.unit, top_k=5
342
- )
343
 
344
- validated_materials = [
345
- ValidatedMaterial(
346
- materialId=m['materialId'],
347
- name=m['name'],
348
- material=m['material'],
349
- unit=m['unit'] or 'unit',
350
- price=float(m['price']),
351
- margin=float(m['margin']),
352
- categories=m['categories'],
353
- confidence_score=round(score / 10.0, 2)
 
 
 
 
354
  )
355
- for m, score in material_matches
356
- ]
357
-
358
- validated_task = ValidatedTask(
359
- taskId=best_task['taskId'],
360
- task=best_task['task'],
361
- displayName=best_task['displayName'],
362
- unit=best_task['unit'],
363
- stageId=best_task['stageId'],
364
- roomArea=parse_room_area(best_task['roomArea']),
365
- confidence_score=round(task_confidence, 2),
366
- recommended_materials=validated_materials
367
- )
 
 
 
 
 
 
 
 
 
 
 
368
 
369
- validated_stages_dict[stage_id]['tasks'].append(validated_task)
370
-
371
- validated_stages = [
372
- ValidatedStage(
373
- stageId=stage_data['stage_data']['stageId'],
374
- stage=stage_data['stage_data']['stage'],
375
- priority=stage_data['stage_data']['priority'],
376
- confidence_score=round(stage_data['confidence'], 2),
377
- tasks=stage_data['tasks']
378
- )
379
- for stage_data in validated_stages_dict.values()
380
- ]
381
-
382
- validated_stages.sort(key=lambda x: x.priority)
383
 
384
- validated_area = ValidatedArea(
 
 
385
  roomId=matched_room['id'] if matched_room else None,
386
- name=matched_room['name'] if matched_room else area_scope.area,
387
- roomType=matched_room['roomType'] if matched_room else 'unknown',
388
- matched=matched_room is not None,
389
- confidence_score=round(room_confidence, 2),
390
- stages=validated_stages
391
  )
392
-
393
- validated_areas.append(validated_area)
394
-
395
- summary = {
396
- 'total_areas': len(validated_areas),
397
- 'total_stages': sum(len(a.stages) for a in validated_areas),
398
- 'total_tasks': sum(len(s.tasks) for a in validated_areas for s in a.stages),
399
- 'total_materials': sum(
400
- len(t.recommended_materials)
401
- for a in validated_areas
402
- for s in a.stages
403
- for t in s.tasks
404
- ),
405
- 'matched_areas': sum(1 for a in validated_areas if a.matched),
406
- 'avg_confidence': round(
407
- np.mean([a.confidence_score for a in validated_areas]), 2
408
- ) if validated_areas else 0.0
 
 
 
409
  }
410
 
411
- return ValidatedResponse(areas=validated_areas, summary=summary)
412
 
413
  # ============= API ENDPOINTS =============
414
  @app.get("/")
415
  async def root():
416
  return {
417
- "service": "Construction Scope Validator",
418
- "version": "1.0.0",
419
  "status": "running",
 
420
  "data_loaded": len(db.stages) > 0,
421
- "model_type": "trained" if os.path.exists('model.safetensors') else "base"
 
 
422
  }
423
 
424
  @app.get("/health")
@@ -429,13 +526,12 @@ async def health():
429
  "tasks_loaded": len(db.tasks),
430
  "materials_loaded": len(db.materials),
431
  "rooms_loaded": len(db.rooms),
432
- "embeddings_ready": db.stage_embeddings is not None,
433
- "model_type": "trained" if os.path.exists('model.safetensors') else "base"
434
  }
435
 
436
- @app.post("/validate", response_model=ValidatedResponse)
437
- async def validate_scope_endpoint(request: LLMScopeRequest):
438
- """Validate LLM-generated scope against database"""
439
  try:
440
  if not db.stages:
441
  raise HTTPException(status_code=500, detail="Database not loaded")
@@ -446,39 +542,30 @@ async def validate_scope_endpoint(request: LLMScopeRequest):
446
  error_detail = f"Validation error: {str(e)}\n{traceback.format_exc()}"
447
  raise HTTPException(status_code=500, detail=error_detail)
448
 
449
- @app.post("/match-stage")
450
- async def match_stage(stage_name: str):
451
- """Test endpoint: match a single stage name"""
452
- matched_stage, confidence = find_best_stage(stage_name)
453
- if matched_stage:
454
- return {
455
- "input": stage_name,
456
- "matched": matched_stage,
457
- "confidence": round(confidence, 2)
458
- }
459
- return {"input": stage_name, "matched": None, "confidence": 0.0}
460
-
461
- @app.post("/match-room")
462
- async def match_room(room_name: str):
463
- """Test endpoint: match a single room name"""
464
- matched_room, confidence = find_best_room(room_name)
465
- if matched_room:
466
- return {
467
- "input": room_name,
468
- "matched": matched_room,
469
- "confidence": round(confidence, 2)
470
- }
471
- return {"input": room_name, "matched": None, "confidence": 0.0}
472
 
473
  # ============= STARTUP =============
474
  @app.on_event("startup")
475
  async def startup_event():
476
- """Load data and initialize embeddings on startup"""
477
  try:
478
  print("\n" + "="*60)
479
- print("STARTING UP...")
480
  print("="*60)
481
 
 
 
 
482
  db.load_data(
483
  stages_file='stages.json',
484
  tasks_file='tasks.json',
@@ -487,8 +574,7 @@ async def startup_event():
487
  )
488
  db.initialize_embeddings()
489
 
490
- print("\n" + "="*60)
491
- print("✅ SERVICE READY!")
492
  print("="*60)
493
  except Exception as e:
494
  print(f"\n❌ STARTUP ERROR: {e}")
@@ -498,9 +584,10 @@ async def startup_event():
498
  if __name__ == "__main__":
499
  import uvicorn
500
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
501
  # """
502
  # FastAPI Service for Construction Scope Validation
503
- # Deploy on Hugging Face Spaces
504
  # """
505
  # from fastapi import FastAPI, HTTPException
506
  # from fastapi.middleware.cors import CORSMiddleware
@@ -509,6 +596,7 @@ if __name__ == "__main__":
509
  # import json
510
  # import numpy as np
511
  # import os
 
512
  # from sentence_transformers import SentenceTransformer
513
  # from sklearn.metrics.pairwise import cosine_similarity
514
  # import re
@@ -518,6 +606,7 @@ if __name__ == "__main__":
518
  # description="Validates and enriches LLM-generated construction scope with DB data",
519
  # version="1.0.0"
520
  # )
 
521
 
522
  # # CORS middleware
523
  # app.add_middleware(
@@ -528,22 +617,57 @@ if __name__ == "__main__":
528
  # allow_headers=["*"],
529
  # )
530
 
531
- # # Load embedding model (cached globally)
532
  # print("="*60)
533
  # print("LOADING MODEL...")
534
  # print("="*60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  # try:
536
  # model_files = ['config.json', 'sentence_bert_config.json']
537
  # has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
538
  # has_model = all(os.path.exists(f) for f in model_files) and has_weights
539
 
540
  # if has_model:
541
- # print("✓ Trained model files found in root directory")
542
  # print("Loading trained model...")
543
  # embedding_model = SentenceTransformer('./', device='cpu')
544
  # print("✅ Trained model loaded successfully!")
545
  # else:
546
- # print("⚠️ Trained model not found, using base model...")
547
  # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
548
  # print("✅ Base model loaded successfully!")
549
  # except Exception as e:
@@ -609,18 +733,13 @@ if __name__ == "__main__":
609
 
610
  # # ============= HELPER FUNCTION =============
611
  # def parse_room_area(room_area_value):
612
- # """
613
- # Parse roomArea field which might be a string, list, or None
614
- # Returns a proper list of strings
615
- # """
616
  # if room_area_value is None:
617
  # return []
618
 
619
- # # If it's already a list, return it
620
  # if isinstance(room_area_value, list):
621
  # return room_area_value
622
 
623
- # # If it's a string, try to parse it as JSON
624
  # if isinstance(room_area_value, str):
625
  # try:
626
  # parsed = json.loads(room_area_value)
@@ -628,10 +747,8 @@ if __name__ == "__main__":
628
  # return parsed
629
  # return [str(parsed)]
630
  # except json.JSONDecodeError:
631
- # # If JSON parsing fails, treat it as a single item
632
  # return [room_area_value]
633
 
634
- # # Fallback: convert to string and wrap in list
635
  # return [str(room_area_value)]
636
 
637
  # # ============= DATABASE LOADERS =============
@@ -701,12 +818,10 @@ if __name__ == "__main__":
701
  # """Find closest matching room from DB"""
702
  # llm_area_lower = llm_area.lower()
703
 
704
- # # Exact match first
705
  # for room in db.rooms:
706
  # if room['name'].lower() == llm_area_lower:
707
  # return room, 1.0
708
 
709
- # # Fuzzy match
710
  # room_texts = [r['name'] for r in db.rooms]
711
  # query_embedding = embedding_model.encode([llm_area])
712
  # room_embeddings = embedding_model.encode(room_texts)
@@ -826,14 +941,13 @@ if __name__ == "__main__":
826
  # for m, score in material_matches
827
  # ]
828
 
829
- # # FIX: Parse roomArea properly
830
  # validated_task = ValidatedTask(
831
  # taskId=best_task['taskId'],
832
  # task=best_task['task'],
833
  # displayName=best_task['displayName'],
834
  # unit=best_task['unit'],
835
  # stageId=best_task['stageId'],
836
- # roomArea=parse_room_area(best_task['roomArea']), # <-- FIXED HERE
837
  # confidence_score=round(task_confidence, 2),
838
  # recommended_materials=validated_materials
839
  # )
@@ -907,10 +1021,7 @@ if __name__ == "__main__":
907
 
908
  # @app.post("/validate", response_model=ValidatedResponse)
909
  # async def validate_scope_endpoint(request: LLMScopeRequest):
910
- # """
911
- # Validate LLM-generated scope against database
912
- # Returns enriched data with matched stages, tasks, materials, and confidence scores
913
- # """
914
  # try:
915
  # if not db.stages:
916
  # raise HTTPException(status_code=500, detail="Database not loaded")
@@ -967,513 +1078,988 @@ if __name__ == "__main__":
967
  # print("="*60)
968
  # except Exception as e:
969
  # print(f"\n❌ STARTUP ERROR: {e}")
970
- # print("Make sure JSON files are in the correct location")
971
  # import traceback
972
  # traceback.print_exc()
973
 
974
  # if __name__ == "__main__":
975
  # import uvicorn
976
  # uvicorn.run(app, host="0.0.0.0", port=7860)
977
-
978
- # """
979
- # FastAPI Service for Construction Scope Validation
980
- # Deploy on Hugging Face Spaces
981
- # """
982
-
983
- # from fastapi import FastAPI, HTTPException
984
- # from fastapi.middleware.cors import CORSMiddleware
985
- # from pydantic import BaseModel, Field
986
- # from typing import List, Optional, Dict, Any
987
- # import json
988
- # import numpy as np
989
- # import os
990
- # from sentence_transformers import SentenceTransformer
991
- # from sklearn.metrics.pairwise import cosine_similarity
992
- # import re
993
-
994
- # app = FastAPI(
995
- # title="Construction Scope Validator API",
996
- # description="Validates and enriches LLM-generated construction scope with DB data",
997
- # version="1.0.0"
998
- # )
999
-
1000
- # # CORS middleware
1001
- # app.add_middleware(
1002
- # CORSMiddleware,
1003
- # allow_origins=["*"],
1004
- # allow_credentials=True,
1005
- # allow_methods=["*"],
1006
- # allow_headers=["*"],
1007
- # )
1008
-
1009
- # # Load embedding model (cached globally)
1010
- # # Try to load trained model from root, fallback to base model
1011
- # print("="*60)
1012
- # print("LOADING MODEL...")
1013
- # print("="*60)
1014
-
1015
- # try:
1016
- # # Check if trained model files exist in root
1017
- # # Check if trained model files exist in root
1018
- # model_files = ['config.json', 'sentence_bert_config.json']
1019
- # # Check for either pytorch_model.bin or model.safetensors
1020
- # has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
1021
- # has_model = all(os.path.exists(f) for f in model_files) and has_weights
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1022
 
1023
- # if has_model:
1024
- # print("✓ Trained model files found in root directory")
1025
- # print("Loading trained model...")
1026
- # embedding_model = SentenceTransformer('./', device='cpu')
1027
- # print(" Trained model loaded successfully!")
1028
- # else:
1029
- # print("⚠️ Trained model not found, using base model...")
1030
- # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
1031
- # print(" Base model loaded successfully!")
1032
- # except Exception as e:
1033
- # print(f"❌ Error loading trained model: {e}")
1034
- # print("Falling back to base model...")
1035
- # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
1036
- # print(" Base model loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
 
1038
- # print("="*60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1039
 
1040
- # # ============= DATA MODELS =============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1041
 
1042
- # class LLMScopeItem(BaseModel):
1043
- # stage: str
1044
- # task: str
1045
- # material: str
1046
- # quantity: float
1047
- # unit: str
1048
 
1049
- # class LLMAreaScope(BaseModel):
1050
- # area: str
1051
- # items: List[LLMScopeItem]
1052
 
1053
- # class LLMScopeRequest(BaseModel):
1054
- # scope_of_work: List[LLMAreaScope]
 
 
 
 
 
 
 
 
 
1055
 
1056
- # class ValidatedMaterial(BaseModel):
1057
- # materialId: int
1058
- # name: str
1059
- # material: str
1060
- # unit: str
1061
- # price: float
1062
- # margin: float
1063
- # categories: List[str]
1064
- # confidence_score: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1065
 
1066
- # class ValidatedTask(BaseModel):
1067
- # taskId: int
1068
- # task: str
1069
- # displayName: str
1070
- # unit: str
1071
- # stageId: int
1072
- # roomArea: List[str]
1073
- # confidence_score: float
1074
- # recommended_materials: List[ValidatedMaterial]
1075
 
1076
- # class ValidatedStage(BaseModel):
1077
- # stageId: int
1078
- # stage: str
1079
- # priority: int
1080
- # confidence_score: float
1081
- # tasks: List[ValidatedTask]
1082
-
1083
- # class ValidatedArea(BaseModel):
1084
- # roomId: Optional[int]
1085
- # name: str
1086
- # roomType: str
1087
- # matched: bool
1088
- # confidence_score: float
1089
- # stages: List[ValidatedStage]
1090
-
1091
- # class ValidatedResponse(BaseModel):
1092
- # areas: List[ValidatedArea]
1093
- # summary: Dict[str, Any]
1094
-
1095
- # # ============= DATABASE LOADERS =============
1096
-
1097
- # class DatabaseLoader:
1098
- # def __init__(self):
1099
- # self.stages = []
1100
- # self.tasks = []
1101
- # self.materials = []
1102
- # self.rooms = []
1103
- # self.stage_embeddings = None
1104
- # self.task_embeddings = None
1105
- # self.material_embeddings = None
1106
-
1107
- # def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
1108
- # """Load JSON data files"""
1109
- # print(f"Loading {stages_file}...")
1110
- # with open(stages_file, 'r', encoding='utf-8') as f:
1111
- # self.stages = [json.loads(line) for line in f if line.strip()]
1112
-
1113
- # print(f"Loading {tasks_file}...")
1114
- # with open(tasks_file, 'r', encoding='utf-8') as f:
1115
- # self.tasks = [json.loads(line) for line in f if line.strip()]
1116
-
1117
- # print(f"Loading {materials_file}...")
1118
- # with open(materials_file, 'r', encoding='utf-8') as f:
1119
- # self.materials = [json.loads(line) for line in f if line.strip()]
1120
-
1121
- # print(f"Loading {rooms_file}...")
1122
- # with open(rooms_file, 'r', encoding='utf-8') as f:
1123
- # self.rooms = [json.loads(line) for line in f if line.strip()]
1124
-
1125
- # print(f"✅ Loaded: {len(self.stages)} stages, {len(self.tasks)} tasks, "
1126
- # f"{len(self.materials)} materials, {len(self.rooms)} rooms")
1127
-
1128
- # def initialize_embeddings(self):
1129
- # """Pre-compute embeddings for fast lookup"""
1130
- # print("Computing stage embeddings...")
1131
- # stage_texts = [s['stage'] for s in self.stages]
1132
- # self.stage_embeddings = embedding_model.encode(stage_texts, show_progress_bar=True)
1133
-
1134
- # print("Computing task embeddings...")
1135
- # task_texts = [t['task'] for t in self.tasks]
1136
- # self.task_embeddings = embedding_model.encode(task_texts, show_progress_bar=True)
1137
-
1138
- # print("Computing material embeddings...")
1139
- # material_texts = [m['material'] for m in self.materials]
1140
- # self.material_embeddings = embedding_model.encode(material_texts, show_progress_bar=True)
1141
-
1142
- # print("✅ Embeddings ready!")
1143
-
1144
- # # Global DB instance
1145
- # db = DatabaseLoader()
1146
-
1147
- # # ============= MATCHING FUNCTIONS =============
1148
-
1149
- # def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
1150
- # """Find closest matching stage from DB"""
1151
- # query_embedding = embedding_model.encode([llm_stage])
1152
- # similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
1153
-
1154
- # best_idx = np.argmax(similarities)
1155
- # best_score = similarities[best_idx]
1156
-
1157
- # if best_score >= threshold:
1158
- # return db.stages[best_idx], best_score
1159
- # return None, 0.0
1160
-
1161
- # def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
1162
- # """Find closest matching room from DB"""
1163
- # llm_area_lower = llm_area.lower()
1164
-
1165
- # # Exact match first
1166
- # for room in db.rooms:
1167
- # if room['name'].lower() == llm_area_lower:
1168
- # return room, 1.0
1169
-
1170
- # # Fuzzy match
1171
- # room_texts = [r['name'] for r in db.rooms]
1172
- # query_embedding = embedding_model.encode([llm_area])
1173
- # room_embeddings = embedding_model.encode(room_texts)
1174
- # similarities = cosine_similarity(query_embedding, room_embeddings)[0]
1175
-
1176
- # best_idx = np.argmax(similarities)
1177
- # best_score = similarities[best_idx]
1178
-
1179
- # if best_score >= threshold:
1180
- # return db.rooms[best_idx], best_score
1181
- # return None, 0.0
1182
-
1183
- # def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
1184
- # """Find relevant tasks for a stage matching LLM task description"""
1185
- # # Filter tasks by stage
1186
- # stage_tasks = [t for t in db.tasks if t['stageId'] == stage_id]
1187
-
1188
- # if not stage_tasks:
1189
- # return []
1190
-
1191
- # # Compute similarities
1192
- # task_indices = [db.tasks.index(t) for t in stage_tasks]
1193
- # query_embedding = embedding_model.encode([llm_task])
1194
-
1195
- # stage_task_embeddings = db.task_embeddings[task_indices]
1196
- # similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]
1197
-
1198
- # # Get top K
1199
- # top_indices = np.argsort(similarities)[-top_k:][::-1]
1200
- # results = [(stage_tasks[idx], similarities[idx]) for idx in top_indices]
1201
-
1202
- # return results
1203
-
1204
- # def extract_keywords(text: str) -> List[str]:
1205
- # """Extract meaningful keywords from text"""
1206
- # # Remove common words
1207
- # stop_words = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
1208
- # words = re.findall(r'\b\w+\b', text.lower())
1209
- # return [w for w in words if w not in stop_words and len(w) > 2]
1210
-
1211
- # def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int = 10) -> List[tuple]:
1212
- # """Find materials matching task requirements"""
1213
- # task_keywords = extract_keywords(task['task'])
1214
- # llm_keywords = extract_keywords(llm_material)
1215
- # all_keywords = set(task_keywords + llm_keywords)
1216
-
1217
- # # Filter by unit compatibility
1218
- # compatible_materials = [
1219
- # m for m in db.materials
1220
- # if m['unit'] == unit or m['unit'] == 'unit' or m['unit'] is None
1221
- # ]
1222
-
1223
- # if not compatible_materials:
1224
- # # Fallback: allow any unit
1225
- # compatible_materials = db.materials
1226
-
1227
- # # Score materials
1228
- # scored_materials = []
1229
- # for material in compatible_materials:
1230
- # score = 0.0
1231
- # material_text = material['material'].lower()
1232
-
1233
- # # Keyword matching
1234
- # for keyword in all_keywords:
1235
- # if keyword in material_text:
1236
- # score += 2.0
1237
-
1238
- # # Category matching
1239
- # categories_str = ' '.join(material.get('categories', [])).lower()
1240
- # for keyword in all_keywords:
1241
- # if keyword in categories_str:
1242
- # score += 1.0
1243
-
1244
- # # Embedding similarity
1245
- # material_idx = db.materials.index(material)
1246
- # query_embedding = embedding_model.encode([llm_material])
1247
- # material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
1248
- # semantic_score = cosine_similarity(query_embedding, material_embedding)[0][0]
1249
- # score += semantic_score * 5.0
1250
-
1251
- # if score > 0:
1252
- # scored_materials.append((material, score))
1253
-
1254
- # # Sort and return top K
1255
- # scored_materials.sort(key=lambda x: x[1], reverse=True)
1256
- # return scored_materials[:top_k]
1257
-
1258
- # # ============= VALIDATION PIPELINE =============
1259
-
1260
- # def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
1261
- # """Main validation pipeline"""
1262
- # validated_areas = []
1263
 
1264
- # for area_scope in llm_scope.scope_of_work:
1265
- # # Match room/area
1266
- # matched_room, room_confidence = find_best_room(area_scope.area)
1267
 
1268
- # validated_stages_dict = {}
1269
 
1270
- # for item in area_scope.items:
1271
- # # Match stage
1272
- # matched_stage, stage_confidence = find_best_stage(item.stage)
1273
 
1274
- # if not matched_stage:
1275
- # continue # Skip if stage not found
1276
 
1277
- # stage_id = matched_stage['stageId']
1278
 
1279
- # # Initialize stage if new
1280
- # if stage_id not in validated_stages_dict:
1281
- # validated_stages_dict[stage_id] = {
1282
- # 'stage_data': matched_stage,
1283
- # 'confidence': stage_confidence,
1284
- # 'tasks': []
1285
- # }
1286
 
1287
- # # Match task
1288
- # task_matches = find_tasks_for_stage(stage_id, item.task, top_k=3)
1289
 
1290
- # if not task_matches:
1291
- # continue
1292
 
1293
- # best_task, task_confidence = task_matches[0]
1294
 
1295
- # # Match materials
1296
- # material_matches = find_materials_for_task(
1297
- # best_task,
1298
- # item.material,
1299
- # item.unit,
1300
- # top_k=5
1301
- # )
1302
 
1303
- # validated_materials = [
1304
- # ValidatedMaterial(
1305
- # materialId=m['materialId'],
1306
- # name=m['name'],
1307
- # material=m['material'],
1308
- # unit=m['unit'] or 'unit',
1309
- # price=float(m['price']),
1310
- # margin=float(m['margin']),
1311
- # categories=m['categories'],
1312
- # confidence_score=round(score / 10.0, 2)
1313
- # )
1314
- # for m, score in material_matches
1315
- # ]
1316
 
1317
- # validated_task = ValidatedTask(
1318
- # taskId=best_task['taskId'],
1319
- # task=best_task['task'],
1320
- # displayName=best_task['displayName'],
1321
- # unit=best_task['unit'],
1322
- # stageId=best_task['stageId'],
1323
- # roomArea=best_task['roomArea'],
1324
- # confidence_score=round(task_confidence, 2),
1325
- # recommended_materials=validated_materials
1326
- # )
1327
 
1328
- # validated_stages_dict[stage_id]['tasks'].append(validated_task)
1329
 
1330
- # # Build validated stages list
1331
- # validated_stages = [
1332
- # ValidatedStage(
1333
- # stageId=stage_data['stage_data']['stageId'],
1334
- # stage=stage_data['stage_data']['stage'],
1335
- # priority=stage_data['stage_data']['priority'],
1336
- # confidence_score=round(stage_data['confidence'], 2),
1337
- # tasks=stage_data['tasks']
1338
- # )
1339
- # for stage_data in validated_stages_dict.values()
1340
- # ]
1341
 
1342
- # # Sort stages by priority
1343
- # validated_stages.sort(key=lambda x: x.priority)
1344
 
1345
- # validated_area = ValidatedArea(
1346
- # roomId=matched_room['id'] if matched_room else None,
1347
- # name=matched_room['name'] if matched_room else area_scope.area,
1348
- # roomType=matched_room['roomType'] if matched_room else 'unknown',
1349
- # matched=matched_room is not None,
1350
- # confidence_score=round(room_confidence, 2),
1351
- # stages=validated_stages
1352
- # )
1353
 
1354
- # validated_areas.append(validated_area)
1355
 
1356
- # # Build summary
1357
- # summary = {
1358
- # 'total_areas': len(validated_areas),
1359
- # 'total_stages': sum(len(a.stages) for a in validated_areas),
1360
- # 'total_tasks': sum(len(s.tasks) for a in validated_areas for s in a.stages),
1361
- # 'total_materials': sum(
1362
- # len(t.recommended_materials)
1363
- # for a in validated_areas
1364
- # for s in a.stages
1365
- # for t in s.tasks
1366
- # ),
1367
- # 'matched_areas': sum(1 for a in validated_areas if a.matched),
1368
- # 'avg_confidence': round(
1369
- # np.mean([a.confidence_score for a in validated_areas]), 2
1370
- # ) if validated_areas else 0.0
1371
- # }
1372
 
1373
- # return ValidatedResponse(areas=validated_areas, summary=summary)
1374
-
1375
- # # ============= API ENDPOINTS =============
1376
-
1377
- # @app.get("/")
1378
- # async def root():
1379
- # return {
1380
- # "service": "Construction Scope Validator",
1381
- # "version": "1.0.0",
1382
- # "status": "running",
1383
- # "data_loaded": len(db.stages) > 0,
1384
- # "model_type": "trained" if os.path.exists('pytorch_model.bin') else "base"
1385
- # }
1386
-
1387
- # @app.get("/health")
1388
- # async def health():
1389
- # return {
1390
- # "status": "healthy",
1391
- # "stages_loaded": len(db.stages),
1392
- # "tasks_loaded": len(db.tasks),
1393
- # "materials_loaded": len(db.materials),
1394
- # "rooms_loaded": len(db.rooms),
1395
- # "embeddings_ready": db.stage_embeddings is not None,
1396
- # "model_type": "trained" if os.path.exists('pytorch_model.bin') else "base"
1397
- # }
1398
-
1399
- # @app.post("/validate", response_model=ValidatedResponse)
1400
- # async def validate_scope_endpoint(request: LLMScopeRequest):
1401
- # """
1402
- # Validate LLM-generated scope against database
1403
 
1404
- # Returns enriched data with:
1405
- # - Matched stages from DB
1406
- # - Matched tasks from DB
1407
- # - Recommended materials with pricing
1408
- # - Confidence scores for all matches
1409
- # """
1410
- # try:
1411
- # if not db.stages:
1412
- # raise HTTPException(status_code=500, detail="Database not loaded")
1413
 
1414
- # result = validate_scope(request)
1415
- # return result
1416
 
1417
- # except Exception as e:
1418
- # raise HTTPException(status_code=500, detail=f"Validation error: {str(e)}")
1419
-
1420
- # @app.post("/match-stage")
1421
- # async def match_stage(stage_name: str):
1422
- # """Test endpoint: match a single stage name"""
1423
- # matched_stage, confidence = find_best_stage(stage_name)
1424
- # if matched_stage:
1425
- # return {
1426
- # "input": stage_name,
1427
- # "matched": matched_stage,
1428
- # "confidence": round(confidence, 2)
1429
- # }
1430
- # return {"input": stage_name, "matched": None, "confidence": 0.0}
1431
-
1432
- # @app.post("/match-room")
1433
- # async def match_room(room_name: str):
1434
- # """Test endpoint: match a single room name"""
1435
- # matched_room, confidence = find_best_room(room_name)
1436
- # if matched_room:
1437
- # return {
1438
- # "input": room_name,
1439
- # "matched": matched_room,
1440
- # "confidence": round(confidence, 2)
1441
- # }
1442
- # return {"input": room_name, "matched": None, "confidence": 0.0}
1443
-
1444
- # # ============= STARTUP =============
1445
-
1446
- # @app.on_event("startup")
1447
- # async def startup_event():
1448
- # """Load data and initialize embeddings on startup"""
1449
- # try:
1450
- # print("\n" + "="*60)
1451
- # print("STARTING UP...")
1452
- # print("="*60)
1453
 
1454
- # # Check what files are available
1455
- # print("\nFiles in root directory:")
1456
- # for file in os.listdir('.'):
1457
- # print(f" - {file}")
1458
 
1459
- # # Load data
1460
- # db.load_data(
1461
- # stages_file='stages.json',
1462
- # tasks_file='tasks.json',
1463
- # materials_file='materials.json',
1464
- # rooms_file='rooms.json'
1465
- # )
1466
- # db.initialize_embeddings()
1467
 
1468
- # print("\n" + "="*60)
1469
- # print("✅ SERVICE READY!")
1470
- # print("="*60)
1471
- # except Exception as e:
1472
- # print(f"\n❌ STARTUP ERROR: {e}")
1473
- # print("Make sure JSON files are in the correct location")
1474
- # import traceback
1475
- # traceback.print_exc()
1476
-
1477
- # if __name__ == "__main__":
1478
- # import uvicorn
1479
- # uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
1
  """
2
+ FastAPI Service for Construction Scope Validation - FIXED VERSION
3
+ Includes semantic validation to prevent wrong tasks being assigned to stages
4
  """
5
  from fastapi import FastAPI, HTTPException
6
  from fastapi.middleware.cors import CORSMiddleware
7
  from pydantic import BaseModel, Field
8
+ from typing import List, Optional, Dict, Any, Tuple
9
  import json
10
  import numpy as np
11
  import os
12
+ import torch
13
  from sentence_transformers import SentenceTransformer
14
  from sklearn.metrics.pairwise import cosine_similarity
15
  import re
16
 
17
+ torch.backends.cudnn.benchmark = True
18
+ torch.backends.cuda.matmul.allow_tf32 = True
19
+ torch.set_float32_matmul_precision('high')
20
+ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
21
+
22
  app = FastAPI(
23
+ title="Construction Scope Validator API - Fixed",
24
+ description="Validates with semantic task-stage checking",
25
+ version="2.1.0"
26
  )
 
27
 
 
28
  app.add_middleware(
29
  CORSMiddleware,
30
  allow_origins=["*"],
 
33
  allow_headers=["*"],
34
  )
35
 
36
+ # ============= MODEL LOADING =============
37
  print("="*60)
38
  print("LOADING MODEL...")
39
  print("="*60)
40
 
41
  def setup_model_structure():
 
 
 
 
 
42
  if not os.path.exists('1_Pooling') or not os.path.exists('2_Normalize'):
43
  print("Creating temporary model structure...")
 
 
44
  os.makedirs('1_Pooling', exist_ok=True)
45
  os.makedirs('2_Normalize', exist_ok=True)
46
 
 
47
  pooling_config = {
48
  "word_embedding_dimension": 384,
49
  "pooling_mode_cls_token": False,
 
54
  with open('1_Pooling/config.json', 'w') as f:
55
  json.dump(pooling_config, f, indent=2)
56
 
 
57
  with open('2_Normalize/config.json', 'w') as f:
58
  json.dump({}, f)
59
 
60
+ print("✓ Created model structure")
 
61
 
 
62
  setup_model_structure()
63
 
64
+ print(f"CUDA available: {torch.cuda.is_available()}")
65
+ if torch.cuda.is_available():
66
+ print(f"GPU device: {torch.cuda.get_device_name(0)}")
67
+
68
  try:
69
  model_files = ['config.json', 'sentence_bert_config.json']
70
  has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
71
  has_model = all(os.path.exists(f) for f in model_files) and has_weights
72
 
73
  if has_model:
74
+ print("✓ Loading trained model...")
75
+ embedding_model = SentenceTransformer('./', device='cuda')
76
+ print("✅ Trained model loaded!")
 
77
  else:
78
+ print("⚠️ Loading base model...")
79
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
80
+ print("✅ Base model loaded!")
81
  except Exception as e:
82
+ print(f"❌ Error: {e}")
83
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
84
+
85
+ BATCH_SIZE = 4096
86
+ print(f"✓ Batch Size: {BATCH_SIZE}")
87
  print("="*60)
88
 
89
  # ============= DATA MODELS =============
90
class ScopeItem(BaseModel):
    """One scope line item from the LLM, with optional DB-enrichment fields."""
    # Raw values produced by the LLM.
    stage: str
    task: str
    material: str
    quantity: float
    unit: str

    # Enrichment fields — populated by validate_scope(); None until matched.
    stageId: Optional[int] = None
    taskId: Optional[int] = None
    materialId: Optional[int] = None
    stage_confidence: Optional[float] = None
    task_confidence: Optional[float] = None
    material_confidence: Optional[float] = None
    validated_stage: Optional[str] = None
    validated_task: Optional[str] = None
    validated_material: Optional[str] = None
    material_price: Optional[float] = None
    material_margin: Optional[float] = None
    # Validation flags: whether the matched task passed the semantic
    # task-vs-stage check, and the stageId that task carries in the DB
    # (may differ from the matched stage when the global fallback was used).
    task_semantic_valid: Optional[bool] = None
    task_database_stageId: Optional[int] = None
112
+
113
class AreaScope(BaseModel):
    """A room/area with its scope items, plus room-match enrichment fields."""
    area: str
    items: List[ScopeItem]

    # Enrichment — populated by validate_scope() from the best room match.
    roomId: Optional[int] = None
    roomType: Optional[str] = None
    area_confidence: Optional[float] = None
    validated_area: Optional[str] = None
122
class ScopeRequest(BaseModel):
    """Request body: the LLM-generated scope of work to validate."""
    scope_of_work: List[AreaScope]
 
 
 
 
 
 
 
124
 
125
class ScopeResponse(BaseModel):
    """Enriched scope plus summary metadata (counts and validation rates)."""
    scope_of_work: List[AreaScope]
    metadata: Optional[Dict[str, Any]] = None
 
 
 
 
 
 
128
 
129
+ # ============= HELPER FUNCTIONS =============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def parse_room_area(room_area_value):
 
131
  if room_area_value is None:
132
  return []
 
133
  if isinstance(room_area_value, list):
134
  return room_area_value
 
135
  if isinstance(room_area_value, str):
136
  try:
137
  parsed = json.loads(room_area_value)
 
140
  return [str(parsed)]
141
  except json.JSONDecodeError:
142
  return [room_area_value]
 
143
  return [str(room_area_value)]
144
 
145
+ # ============= DATABASE LOADER =============
146
  class DatabaseLoader:
147
  def __init__(self):
148
  self.stages = []
 
154
  self.material_embeddings = None
155
 
156
  def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
 
157
  print(f"Loading {stages_file}...")
158
  with open(stages_file, 'r', encoding='utf-8') as f:
159
  self.stages = [json.loads(line) for line in f if line.strip()]
 
174
  f"{len(self.materials)} materials, {len(self.rooms)} rooms")
175
 
176
  def initialize_embeddings(self):
177
+ print("\n" + "="*60)
178
+ print("INITIALIZING EMBEDDINGS")
179
+ print("="*60)
180
+
181
+ print(f"Computing stage embeddings...")
182
  stage_texts = [s['stage'] for s in self.stages]
183
+ self.stage_embeddings = embedding_model.encode(
184
+ stage_texts,
185
+ batch_size=BATCH_SIZE,
186
+ show_progress_bar=True,
187
+ convert_to_numpy=True,
188
+ normalize_embeddings=True
189
+ )
190
 
191
+ print(f"Computing task embeddings...")
192
  task_texts = [t['task'] for t in self.tasks]
193
+ self.task_embeddings = embedding_model.encode(
194
+ task_texts,
195
+ batch_size=BATCH_SIZE,
196
+ show_progress_bar=True,
197
+ convert_to_numpy=True,
198
+ normalize_embeddings=True
199
+ )
200
 
201
+ print(f"Computing material embeddings...")
202
  material_texts = [m['material'] for m in self.materials]
203
+ self.material_embeddings = embedding_model.encode(
204
+ material_texts,
205
+ batch_size=BATCH_SIZE,
206
+ show_progress_bar=True,
207
+ convert_to_numpy=True,
208
+ normalize_embeddings=True
209
+ )
210
 
211
+ print("="*60)
212
  print("✅ Embeddings ready!")
213
+ print("="*60)
214
 
 
215
  db = DatabaseLoader()
216
 
217
+ # ============= SEMANTIC VALIDATOR =============
218
class SemanticValidator:
    """Checks whether a matched task plausibly belongs to a matched stage."""

    def __init__(self):
        pass

    def validate_task_for_stage(self, task: dict, stage: dict,
                                task_confidence: float) -> Tuple[bool, float]:
        """Return (is_valid, adjusted_confidence) for a task/stage pair.

        Looks up the precomputed embeddings for both records in the global
        `db`, compares them with cosine similarity, and rejects pairs whose
        similarity falls below a minimum overlap. On acceptance the incoming
        task confidence is scaled down proportionally when the similarity is
        below 0.4 (capped at the original confidence).
        """
        # Locate the embedding rows for this stage and task by their ids.
        stage_idx = None
        for i, s in enumerate(db.stages):
            if s['stageId'] == stage['stageId']:
                stage_idx = i
                break

        task_idx = None
        for i, t in enumerate(db.tasks):
            if t['taskId'] == task['taskId']:
                task_idx = i
                break

        if stage_idx is None or task_idx is None:
            return False, 0.0

        similarity = cosine_similarity(
            db.stage_embeddings[stage_idx].reshape(1, -1),
            db.task_embeddings[task_idx].reshape(1, -1),
        )[0][0]

        # Minimum semantic overlap; deliberately lenient.
        semantic_threshold = 0.25
        if similarity < semantic_threshold:
            return False, 0.0

        adjusted = task_confidence * min(similarity / 0.4, 1.0)
        return True, adjusted
251
+
252
+ validator = SemanticValidator()
253
+
254
  # ============= MATCHING FUNCTIONS =============
255
  def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
256
+ query_embedding = embedding_model.encode(
257
+ [llm_stage],
258
+ batch_size=BATCH_SIZE,
259
+ convert_to_numpy=True,
260
+ normalize_embeddings=True
261
+ )
262
  similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
263
  best_idx = np.argmax(similarities)
264
  best_score = similarities[best_idx]
 
268
  return None, 0.0
269
 
270
  def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
 
271
  llm_area_lower = llm_area.lower()
272
 
273
  for room in db.rooms:
 
275
  return room, 1.0
276
 
277
  room_texts = [r['name'] for r in db.rooms]
278
+ query_embedding = embedding_model.encode(
279
+ [llm_area],
280
+ batch_size=BATCH_SIZE,
281
+ convert_to_numpy=True,
282
+ normalize_embeddings=True
283
+ )
284
+ room_embeddings = embedding_model.encode(
285
+ room_texts,
286
+ batch_size=BATCH_SIZE,
287
+ convert_to_numpy=True,
288
+ normalize_embeddings=True
289
+ )
290
  similarities = cosine_similarity(query_embedding, room_embeddings)[0]
291
 
292
  best_idx = np.argmax(similarities)
 
296
  return db.rooms[best_idx], best_score
297
  return None, 0.0
298
 
299
def find_best_task_with_semantic_validation(
    stage_id: int,
    llm_task: str,
    stage: dict,
    fallback_to_global: bool = True
) -> Tuple[Optional[dict], float, bool, Optional[int]]:
    """Match an LLM task description to a DB task, with semantic validation.

    First searches tasks that already belong to `stage_id`; each of the top
    3 candidates must pass the stage/task semantic check with adjusted
    confidence > 0.35. If none qualifies and `fallback_to_global` is True,
    the top 5 tasks across the whole DB are tried with a lower bar (> 0.3).

    Returns:
        (task, confidence, is_semantically_valid, original_db_stageId),
        or (None, 0.0, False, None) when nothing qualifies.
    """
    # Encode the query once; it is reused by both search passes.
    query_embedding = embedding_model.encode(
        [llm_task],
        batch_size=1,
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    # Collect stage-specific tasks and their embedding-row indices in a
    # single pass. (The previous `db.tasks.index(t)` per task was an
    # accidental O(n^2) scan.)
    task_indices = [i for i, t in enumerate(db.tasks) if t['stageId'] == stage_id]
    stage_tasks = [db.tasks[i] for i in task_indices]

    if stage_tasks:
        similarities = cosine_similarity(
            query_embedding, db.task_embeddings[task_indices]
        )[0]

        # Try the top 3 stage-local candidates.
        for idx in np.argsort(similarities)[-3:][::-1]:
            candidate_task = stage_tasks[idx]
            is_valid, adjusted_confidence = validator.validate_task_for_stage(
                candidate_task, stage, similarities[idx]
            )
            if is_valid and adjusted_confidence > 0.35:
                return (candidate_task, adjusted_confidence, True,
                        candidate_task['stageId'])

    # Fallback: search ALL tasks, still validated against the matched stage.
    if fallback_to_global:
        all_similarities = cosine_similarity(query_embedding, db.task_embeddings)[0]

        for idx in np.argsort(all_similarities)[-5:][::-1]:
            candidate_task = db.tasks[idx]
            is_valid, adjusted_confidence = validator.validate_task_for_stage(
                candidate_task, stage, all_similarities[idx]
            )
            if is_valid and adjusted_confidence > 0.3:
                return (candidate_task, adjusted_confidence, True,
                        candidate_task['stageId'])

    return None, 0.0, False, None
 
 
365
 
366
def extract_keywords(text: str) -> List[str]:
    """Tokenize `text` and return lowercase keywords, preserving order.

    Drops a small set of construction-domain stop words and any token of
    two characters or fewer.
    """
    ignored = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
    keywords = []
    for token in re.findall(r'\b\w+\b', text.lower()):
        if len(token) > 2 and token not in ignored:
            keywords.append(token)
    return keywords
370
 
371
+ def find_best_material(task: dict, llm_material: str, unit: str) -> tuple:
372
+ """Find single best material for task"""
373
  task_keywords = extract_keywords(task['task'])
374
  llm_keywords = extract_keywords(llm_material)
375
  all_keywords = set(task_keywords + llm_keywords)
 
381
  if not compatible_materials:
382
  compatible_materials = db.materials
383
 
384
+ query_embedding = embedding_model.encode(
385
+ [llm_material],
386
+ batch_size=1,
387
+ convert_to_numpy=True,
388
+ normalize_embeddings=True
389
+ )
390
+
391
  scored_materials = []
392
  for material in compatible_materials:
393
  score = 0.0
 
403
  score += 1.0
404
 
405
  material_idx = db.materials.index(material)
 
406
  material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
407
  semantic_score = cosine_similarity(query_embedding, material_embedding)[0][0]
408
  score += semantic_score * 5.0
 
410
  if score > 0:
411
  scored_materials.append((material, score))
412
 
413
+ if not scored_materials:
414
+ return None, 0.0
415
+
416
  scored_materials.sort(key=lambda x: x[1], reverse=True)
417
+ return scored_materials[0]
418
 
419
  # ============= VALIDATION PIPELINE =============
420
def validate_scope(request: ScopeRequest) -> ScopeResponse:
    """Validate and enrich the LLM scope against the database.

    For each item: match the stage by embedding similarity, find a
    semantically validated task for that stage (with global fallback), then
    pick the best material for the matched task. Items that fail a step are
    passed through with their enrichment fields left as None.

    Returns:
        ScopeResponse with the enriched scope and summary metadata
        (counts, semantic-mismatch count, validation rates in percent).
    """
    enriched_areas = []

    # Items whose stage matched but for which no semantically valid task
    # could be found anywhere in the DB.
    semantic_mismatches = 0

    for area_scope in request.scope_of_work:
        matched_room, room_confidence = find_best_room(area_scope.area)

        enriched_items = []
        for item in area_scope.items:
            enriched_item = item.model_copy()

            # Match stage
            matched_stage, stage_confidence = find_best_stage(item.stage)
            if matched_stage:
                enriched_item.stageId = matched_stage['stageId']
                enriched_item.validated_stage = matched_stage['stage']
                enriched_item.stage_confidence = round(stage_confidence, 2)

                # Match task with semantic validation
                (matched_task, task_confidence,
                 is_semantic_valid, db_stage_id) = find_best_task_with_semantic_validation(
                    matched_stage['stageId'],
                    item.task,
                    matched_stage,
                    fallback_to_global=True
                )

                if matched_task:
                    enriched_item.taskId = matched_task['taskId']
                    enriched_item.validated_task = matched_task['task']
                    enriched_item.task_confidence = round(task_confidence, 2)
                    enriched_item.task_semantic_valid = is_semantic_valid
                    enriched_item.task_database_stageId = db_stage_id

                    # Match material for the validated task
                    matched_material, material_score = find_best_material(
                        matched_task,
                        item.material,
                        item.unit
                    )
                    if matched_material:
                        enriched_item.materialId = matched_material['materialId']
                        enriched_item.validated_material = matched_material['material']
                        # Raw keyword+semantic score is roughly 0-10;
                        # normalize into a 0-1 confidence.
                        enriched_item.material_confidence = round(material_score / 10.0, 2)
                        enriched_item.material_price = float(matched_material['price'])
                        enriched_item.material_margin = float(matched_material['margin'])
                        enriched_item.material = matched_material['material']
                else:
                    # BUGFIX: the task matcher only ever returns tasks that
                    # passed semantic validation, so the previous
                    # `if not is_semantic_valid` branch was dead code and
                    # semantic_mismatches was always 0. A mismatch is the
                    # case where the stage matched but no valid task exists.
                    semantic_mismatches += 1

            enriched_items.append(enriched_item)

        enriched_area = AreaScope(
            area=area_scope.area,
            items=enriched_items,
            roomId=matched_room['id'] if matched_room else None,
            roomType=matched_room['roomType'] if matched_room else None,
            validated_area=matched_room['name'] if matched_room else area_scope.area,
            area_confidence=round(room_confidence, 2) if matched_room else 0.0
        )
        enriched_areas.append(enriched_area)

    # Calculate metadata
    total_items = sum(len(area.items) for area in enriched_areas)
    validated_stages = sum(1 for area in enriched_areas for item in area.items if item.stageId)
    validated_tasks = sum(1 for area in enriched_areas for item in area.items if item.taskId)
    validated_materials = sum(1 for area in enriched_areas for item in area.items if item.materialId)

    metadata = {
        'total_areas': len(enriched_areas),
        'total_items': total_items,
        'validated_stages': validated_stages,
        'validated_tasks': validated_tasks,
        'validated_materials': validated_materials,
        'semantic_mismatches': semantic_mismatches,
        'validation_rate': {
            'stages': round(validated_stages / total_items * 100, 1) if total_items > 0 else 0,
            'tasks': round(validated_tasks / total_items * 100, 1) if total_items > 0 else 0,
            'materials': round(validated_materials / total_items * 100, 1) if total_items > 0 else 0
        }
    }

    return ScopeResponse(scope_of_work=enriched_areas, metadata=metadata)
506
 
507
  # ============= API ENDPOINTS =============
508
@app.get("/")
async def root():
    """Service banner: version, feature flags, load status and GPU info."""
    has_trained_weights = os.path.exists('model.safetensors')
    banner = {
        "service": "Construction Scope Validator - FIXED",
        "version": "2.1.0",
        "status": "running",
        "features": ["semantic_task_validation", "fallback_search"],
        "data_loaded": len(db.stages) > 0,
        "model_type": "trained" if has_trained_weights else "base",
        "gpu_enabled": torch.cuda.is_available(),
        "batch_size": BATCH_SIZE,
    }
    return banner
520
 
521
  @app.get("/health")
 
526
  "tasks_loaded": len(db.tasks),
527
  "materials_loaded": len(db.materials),
528
  "rooms_loaded": len(db.rooms),
529
+ "embeddings_ready": db.stage_embeddings is not None
 
530
  }
531
 
532
+ @app.post("/validate", response_model=ScopeResponse)
533
+ async def validate_scope_endpoint(request: ScopeRequest):
534
+ """Validate with semantic checking"""
535
  try:
536
  if not db.stages:
537
  raise HTTPException(status_code=500, detail="Database not loaded")
 
542
  error_detail = f"Validation error: {str(e)}\n{traceback.format_exc()}"
543
  raise HTTPException(status_code=500, detail=error_detail)
544
 
545
@app.post("/validate-simple", response_model=ScopeRequest)
async def validate_scope_simple(request: ScopeRequest):
    """Return only the enriched scope, without the metadata envelope.

    The response has the same shape as the request so callers can feed it
    straight back into other scope-consuming endpoints.
    """
    try:
        if not db.stages:
            raise HTTPException(status_code=500, detail="Database not loaded")
        result = validate_scope(request)
        return ScopeRequest(scope_of_work=result.scope_of_work)
    except HTTPException:
        # BUGFIX: HTTPException subclasses Exception, so the deliberate
        # "Database not loaded" error above was previously caught by the
        # generic handler and re-wrapped with a traceback. Re-raise as-is.
        raise
    except Exception as e:
        import traceback
        error_detail = f"Validation error: {str(e)}\n{traceback.format_exc()}"
        raise HTTPException(status_code=500, detail=error_detail)
 
 
 
 
 
 
 
 
 
 
 
557
 
558
  # ============= STARTUP =============
559
  @app.on_event("startup")
560
  async def startup_event():
 
561
  try:
562
  print("\n" + "="*60)
563
+ print("STARTING UP - FIXED VERSION")
564
  print("="*60)
565
 
566
+ if torch.cuda.is_available():
567
+ print(f"\n🚀 GPU ENABLED: {torch.cuda.get_device_name(0)}")
568
+
569
  db.load_data(
570
  stages_file='stages.json',
571
  tasks_file='tasks.json',
 
574
  )
575
  db.initialize_embeddings()
576
 
577
+ print("\n SERVICE READY WITH SEMANTIC VALIDATION!")
 
578
  print("="*60)
579
  except Exception as e:
580
  print(f"\n❌ STARTUP ERROR: {e}")
 
584
  if __name__ == "__main__":
585
  import uvicorn
586
  uvicorn.run(app, host="0.0.0.0", port=7860)
587
+
588
  # """
589
  # FastAPI Service for Construction Scope Validation
590
+ # Deploy on Hugging Face Spaces - Flattened File Structure
591
  # """
592
  # from fastapi import FastAPI, HTTPException
593
  # from fastapi.middleware.cors import CORSMiddleware
 
596
  # import json
597
  # import numpy as np
598
  # import os
599
+ # import shutil
600
  # from sentence_transformers import SentenceTransformer
601
  # from sklearn.metrics.pairwise import cosine_similarity
602
  # import re
 
606
  # description="Validates and enriches LLM-generated construction scope with DB data",
607
  # version="1.0.0"
608
  # )
609
+ # #---------------------------
610
 
611
  # # CORS middleware
612
  # app.add_middleware(
 
617
  # allow_headers=["*"],
618
  # )
619
 
620
+ # # ============= MODEL LOADING WITH FLAT STRUCTURE =============
621
  # print("="*60)
622
  # print("LOADING MODEL...")
623
  # print("="*60)
624
+
625
+ # def setup_model_structure():
626
+ # """
627
+ # Create temporary folder structure for sentence-transformers
628
+ # if files are in root (flattened structure)
629
+ # """
630
+ # # Check if we need to create structure
631
+ # if not os.path.exists('1_Pooling') or not os.path.exists('2_Normalize'):
632
+ # print("Creating temporary model structure...")
633
+
634
+ # # Create directories
635
+ # os.makedirs('1_Pooling', exist_ok=True)
636
+ # os.makedirs('2_Normalize', exist_ok=True)
637
+
638
+ # # Pooling config
639
+ # pooling_config = {
640
+ # "word_embedding_dimension": 384,
641
+ # "pooling_mode_cls_token": False,
642
+ # "pooling_mode_mean_tokens": True,
643
+ # "pooling_mode_max_tokens": False,
644
+ # "pooling_mode_mean_sqrt_len_tokens": False
645
+ # }
646
+ # with open('1_Pooling/config.json', 'w') as f:
647
+ # json.dump(pooling_config, f, indent=2)
648
+
649
+ # # Normalize config (empty is fine)
650
+ # with open('2_Normalize/config.json', 'w') as f:
651
+ # json.dump({}, f)
652
+
653
+ # print("✓ Created 1_Pooling/config.json")
654
+ # print("✓ Created 2_Normalize/config.json")
655
+
656
+ # # Setup structure before loading model
657
+ # setup_model_structure()
658
+
659
  # try:
660
  # model_files = ['config.json', 'sentence_bert_config.json']
661
  # has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
662
  # has_model = all(os.path.exists(f) for f in model_files) and has_weights
663
 
664
  # if has_model:
665
+ # print("✓ Model files found in root directory")
666
  # print("Loading trained model...")
667
  # embedding_model = SentenceTransformer('./', device='cpu')
668
  # print("✅ Trained model loaded successfully!")
669
  # else:
670
+ # print("⚠️ Model not found, using base model...")
671
  # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
672
  # print("✅ Base model loaded successfully!")
673
  # except Exception as e:
 
733
 
734
  # # ============= HELPER FUNCTION =============
735
  # def parse_room_area(room_area_value):
736
+ # """Parse roomArea field which might be a string, list, or None"""
 
 
 
737
  # if room_area_value is None:
738
  # return []
739
 
 
740
  # if isinstance(room_area_value, list):
741
  # return room_area_value
742
 
 
743
  # if isinstance(room_area_value, str):
744
  # try:
745
  # parsed = json.loads(room_area_value)
 
747
  # return parsed
748
  # return [str(parsed)]
749
  # except json.JSONDecodeError:
 
750
  # return [room_area_value]
751
 
 
752
  # return [str(room_area_value)]
753
 
754
  # # ============= DATABASE LOADERS =============
 
818
  # """Find closest matching room from DB"""
819
  # llm_area_lower = llm_area.lower()
820
 
 
821
  # for room in db.rooms:
822
  # if room['name'].lower() == llm_area_lower:
823
  # return room, 1.0
824
 
 
825
  # room_texts = [r['name'] for r in db.rooms]
826
  # query_embedding = embedding_model.encode([llm_area])
827
  # room_embeddings = embedding_model.encode(room_texts)
 
941
  # for m, score in material_matches
942
  # ]
943
 
 
944
  # validated_task = ValidatedTask(
945
  # taskId=best_task['taskId'],
946
  # task=best_task['task'],
947
  # displayName=best_task['displayName'],
948
  # unit=best_task['unit'],
949
  # stageId=best_task['stageId'],
950
+ # roomArea=parse_room_area(best_task['roomArea']),
951
  # confidence_score=round(task_confidence, 2),
952
  # recommended_materials=validated_materials
953
  # )
 
1021
 
1022
  # @app.post("/validate", response_model=ValidatedResponse)
1023
  # async def validate_scope_endpoint(request: LLMScopeRequest):
1024
+ # """Validate LLM-generated scope against database"""
 
 
 
1025
  # try:
1026
  # if not db.stages:
1027
  # raise HTTPException(status_code=500, detail="Database not loaded")
 
1078
  # print("="*60)
1079
  # except Exception as e:
1080
  # print(f"\n❌ STARTUP ERROR: {e}")
 
1081
  # import traceback
1082
  # traceback.print_exc()
1083
 
1084
  # if __name__ == "__main__":
1085
  # import uvicorn
1086
  # uvicorn.run(app, host="0.0.0.0", port=7860)
1087
+ # # """
1088
+ # # FastAPI Service for Construction Scope Validation
1089
+ # # Deploy on Hugging Face Spaces
1090
+ # # """
1091
+ # # from fastapi import FastAPI, HTTPException
1092
+ # # from fastapi.middleware.cors import CORSMiddleware
1093
+ # # from pydantic import BaseModel, Field
1094
+ # # from typing import List, Optional, Dict, Any
1095
+ # # import json
1096
+ # # import numpy as np
1097
+ # # import os
1098
+ # # from sentence_transformers import SentenceTransformer
1099
+ # # from sklearn.metrics.pairwise import cosine_similarity
1100
+ # # import re
1101
+
1102
+ # # app = FastAPI(
1103
+ # # title="Construction Scope Validator API",
1104
+ # # description="Validates and enriches LLM-generated construction scope with DB data",
1105
+ # # version="1.0.0"
1106
+ # # )
1107
+
1108
+ # # # CORS middleware
1109
+ # # app.add_middleware(
1110
+ # # CORSMiddleware,
1111
+ # # allow_origins=["*"],
1112
+ # # allow_credentials=True,
1113
+ # # allow_methods=["*"],
1114
+ # # allow_headers=["*"],
1115
+ # # )
1116
+
1117
+ # # # Load embedding model (cached globally)
1118
+ # # print("="*60)
1119
+ # # print("LOADING MODEL...")
1120
+ # # print("="*60)
1121
+ # # try:
1122
+ # # model_files = ['config.json', 'sentence_bert_config.json']
1123
+ # # has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
1124
+ # # has_model = all(os.path.exists(f) for f in model_files) and has_weights
1125
+
1126
+ # # if has_model:
1127
+ # # print("✓ Trained model files found in root directory")
1128
+ # # print("Loading trained model...")
1129
+ # # embedding_model = SentenceTransformer('./', device='cpu')
1130
+ # # print("✅ Trained model loaded successfully!")
1131
+ # # else:
1132
+ # # print("⚠️ Trained model not found, using base model...")
1133
+ # # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
1134
+ # # print("✅ Base model loaded successfully!")
1135
+ # # except Exception as e:
1136
+ # # print(f"❌ Error loading trained model: {e}")
1137
+ # # print("Falling back to base model...")
1138
+ # # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
1139
+ # # print("✅ Base model loaded successfully!")
1140
+ # # print("="*60)
1141
+
1142
+ # # # ============= DATA MODELS =============
1143
+ # # class LLMScopeItem(BaseModel):
1144
+ # # stage: str
1145
+ # # task: str
1146
+ # # material: str
1147
+ # # quantity: float
1148
+ # # unit: str
1149
+
1150
+ # # class LLMAreaScope(BaseModel):
1151
+ # # area: str
1152
+ # # items: List[LLMScopeItem]
1153
+
1154
+ # # class LLMScopeRequest(BaseModel):
1155
+ # # scope_of_work: List[LLMAreaScope]
1156
+
1157
+ # # class ValidatedMaterial(BaseModel):
1158
+ # # materialId: int
1159
+ # # name: str
1160
+ # # material: str
1161
+ # # unit: str
1162
+ # # price: float
1163
+ # # margin: float
1164
+ # # categories: List[str]
1165
+ # # confidence_score: float
1166
+
1167
+ # # class ValidatedTask(BaseModel):
1168
+ # # taskId: int
1169
+ # # task: str
1170
+ # # displayName: str
1171
+ # # unit: str
1172
+ # # stageId: int
1173
+ # # roomArea: List[str]
1174
+ # # confidence_score: float
1175
+ # # recommended_materials: List[ValidatedMaterial]
1176
+
1177
+ # # class ValidatedStage(BaseModel):
1178
+ # # stageId: int
1179
+ # # stage: str
1180
+ # # priority: int
1181
+ # # confidence_score: float
1182
+ # # tasks: List[ValidatedTask]
1183
+
1184
+ # # class ValidatedArea(BaseModel):
1185
+ # # roomId: Optional[int]
1186
+ # # name: str
1187
+ # # roomType: str
1188
+ # # matched: bool
1189
+ # # confidence_score: float
1190
+ # # stages: List[ValidatedStage]
1191
+
1192
+ # # class ValidatedResponse(BaseModel):
1193
+ # # areas: List[ValidatedArea]
1194
+ # # summary: Dict[str, Any]
1195
+
1196
+ # # # ============= HELPER FUNCTION =============
1197
+ # # def parse_room_area(room_area_value):
1198
+ # # """
1199
+ # # Parse roomArea field which might be a string, list, or None
1200
+ # # Returns a proper list of strings
1201
+ # # """
1202
+ # # if room_area_value is None:
1203
+ # # return []
1204
+
1205
+ # # # If it's already a list, return it
1206
+ # # if isinstance(room_area_value, list):
1207
+ # # return room_area_value
1208
+
1209
+ # # # If it's a string, try to parse it as JSON
1210
+ # # if isinstance(room_area_value, str):
1211
+ # # try:
1212
+ # # parsed = json.loads(room_area_value)
1213
+ # # if isinstance(parsed, list):
1214
+ # # return parsed
1215
+ # # return [str(parsed)]
1216
+ # # except json.JSONDecodeError:
1217
+ # # # If JSON parsing fails, treat it as a single item
1218
+ # # return [room_area_value]
1219
+
1220
+ # # # Fallback: convert to string and wrap in list
1221
+ # # return [str(room_area_value)]
1222
+
1223
+ # # # ============= DATABASE LOADERS =============
1224
+ # # class DatabaseLoader:
1225
+ # # def __init__(self):
1226
+ # # self.stages = []
1227
+ # # self.tasks = []
1228
+ # # self.materials = []
1229
+ # # self.rooms = []
1230
+ # # self.stage_embeddings = None
1231
+ # # self.task_embeddings = None
1232
+ # # self.material_embeddings = None
1233
+
1234
+ # # def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
1235
+ # # """Load JSON data files"""
1236
+ # # print(f"Loading {stages_file}...")
1237
+ # # with open(stages_file, 'r', encoding='utf-8') as f:
1238
+ # # self.stages = [json.loads(line) for line in f if line.strip()]
1239
 
1240
+ # # print(f"Loading {tasks_file}...")
1241
+ # # with open(tasks_file, 'r', encoding='utf-8') as f:
1242
+ # # self.tasks = [json.loads(line) for line in f if line.strip()]
1243
+
1244
+ # # print(f"Loading {materials_file}...")
1245
+ # # with open(materials_file, 'r', encoding='utf-8') as f:
1246
+ # # self.materials = [json.loads(line) for line in f if line.strip()]
1247
+
1248
+ # # print(f"Loading {rooms_file}...")
1249
+ # # with open(rooms_file, 'r', encoding='utf-8') as f:
1250
+ # # self.rooms = [json.loads(line) for line in f if line.strip()]
1251
+
1252
+ # # print(f"✅ Loaded: {len(self.stages)} stages, {len(self.tasks)} tasks, "
1253
+ # # f"{len(self.materials)} materials, {len(self.rooms)} rooms")
1254
+
1255
+ # # def initialize_embeddings(self):
1256
+ # # """Pre-compute embeddings for fast lookup"""
1257
+ # # print("Computing stage embeddings...")
1258
+ # # stage_texts = [s['stage'] for s in self.stages]
1259
+ # # self.stage_embeddings = embedding_model.encode(stage_texts, show_progress_bar=True)
1260
+
1261
+ # # print("Computing task embeddings...")
1262
+ # # task_texts = [t['task'] for t in self.tasks]
1263
+ # # self.task_embeddings = embedding_model.encode(task_texts, show_progress_bar=True)
1264
+
1265
+ # # print("Computing material embeddings...")
1266
+ # # material_texts = [m['material'] for m in self.materials]
1267
+ # # self.material_embeddings = embedding_model.encode(material_texts, show_progress_bar=True)
1268
+
1269
+ # # print("✅ Embeddings ready!")
1270
+
1271
+ # # # Global DB instance
1272
+ # # db = DatabaseLoader()
1273
+
1274
+ # # # ============= MATCHING FUNCTIONS =============
1275
+ # # def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
1276
+ # # """Find closest matching stage from DB"""
1277
+ # # query_embedding = embedding_model.encode([llm_stage])
1278
+ # # similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
1279
+ # # best_idx = np.argmax(similarities)
1280
+ # # best_score = similarities[best_idx]
1281
+
1282
+ # # if best_score >= threshold:
1283
+ # # return db.stages[best_idx], best_score
1284
+ # # return None, 0.0
1285
 
1286
+ # # def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
1287
+ # # """Find closest matching room from DB"""
1288
+ # # llm_area_lower = llm_area.lower()
1289
+
1290
+ # # # Exact match first
1291
+ # # for room in db.rooms:
1292
+ # # if room['name'].lower() == llm_area_lower:
1293
+ # # return room, 1.0
1294
+
1295
+ # # # Fuzzy match
1296
+ # # room_texts = [r['name'] for r in db.rooms]
1297
+ # # query_embedding = embedding_model.encode([llm_area])
1298
+ # # room_embeddings = embedding_model.encode(room_texts)
1299
+ # # similarities = cosine_similarity(query_embedding, room_embeddings)[0]
1300
+
1301
+ # # best_idx = np.argmax(similarities)
1302
+ # # best_score = similarities[best_idx]
1303
+
1304
+ # # if best_score >= threshold:
1305
+ # # return db.rooms[best_idx], best_score
1306
+ # # return None, 0.0
1307
+
1308
+ # # def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
1309
+ # # """Find relevant tasks for a stage matching LLM task description"""
1310
+ # # stage_tasks = [t for t in db.tasks if t['stageId'] == stage_id]
1311
+ # # if not stage_tasks:
1312
+ # # return []
1313
+
1314
+ # # task_indices = [db.tasks.index(t) for t in stage_tasks]
1315
+ # # query_embedding = embedding_model.encode([llm_task])
1316
+ # # stage_task_embeddings = db.task_embeddings[task_indices]
1317
+ # # similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]
1318
+
1319
+ # # top_indices = np.argsort(similarities)[-top_k:][::-1]
1320
+ # # results = [(stage_tasks[idx], similarities[idx]) for idx in top_indices]
1321
+ # # return results
1322
+
1323
+ # # def extract_keywords(text: str) -> List[str]:
1324
+ # # """Extract meaningful keywords from text"""
1325
+ # # stop_words = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
1326
+ # # words = re.findall(r'\b\w+\b', text.lower())
1327
+ # # return [w for w in words if w not in stop_words and len(w) > 2]
1328
+
1329
+ # # def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int = 10) -> List[tuple]:
1330
+ # # """Find materials matching task requirements"""
1331
+ # # task_keywords = extract_keywords(task['task'])
1332
+ # # llm_keywords = extract_keywords(llm_material)
1333
+ # # all_keywords = set(task_keywords + llm_keywords)
1334
+
1335
+ # # compatible_materials = [
1336
+ # # m for m in db.materials
1337
+ # # if m['unit'] == unit or m['unit'] == 'unit' or m['unit'] is None
1338
+ # # ]
1339
+ # # if not compatible_materials:
1340
+ # # compatible_materials = db.materials
1341
+
1342
+ # # scored_materials = []
1343
+ # # for material in compatible_materials:
1344
+ # # score = 0.0
1345
+ # # material_text = material['material'].lower()
1346
+
1347
+ # # for keyword in all_keywords:
1348
+ # # if keyword in material_text:
1349
+ # # score += 2.0
1350
+
1351
+ # # categories_str = ' '.join(material.get('categories', [])).lower()
1352
+ # # for keyword in all_keywords:
1353
+ # # if keyword in categories_str:
1354
+ # # score += 1.0
1355
+
1356
+ # # material_idx = db.materials.index(material)
1357
+ # # query_embedding = embedding_model.encode([llm_material])
1358
+ # # material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
1359
+ # # semantic_score = cosine_similarity(query_embedding, material_embedding)[0][0]
1360
+ # # score += semantic_score * 5.0
1361
+
1362
+ # # if score > 0:
1363
+ # # scored_materials.append((material, score))
1364
+
1365
+ # # scored_materials.sort(key=lambda x: x[1], reverse=True)
1366
+ # # return scored_materials[:top_k]
1367
 
1368
+ # # # ============= VALIDATION PIPELINE =============
1369
+ # # def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
1370
+ # # """Main validation pipeline"""
1371
+ # # validated_areas = []
1372
+
1373
+ # # for area_scope in llm_scope.scope_of_work:
1374
+ # # matched_room, room_confidence = find_best_room(area_scope.area)
1375
+ # # validated_stages_dict = {}
1376
+
1377
+ # # for item in area_scope.items:
1378
+ # # matched_stage, stage_confidence = find_best_stage(item.stage)
1379
+ # # if not matched_stage:
1380
+ # # continue
1381
+
1382
+ # # stage_id = matched_stage['stageId']
1383
+
1384
+ # # if stage_id not in validated_stages_dict:
1385
+ # # validated_stages_dict[stage_id] = {
1386
+ # # 'stage_data': matched_stage,
1387
+ # # 'confidence': stage_confidence,
1388
+ # # 'tasks': []
1389
+ # # }
1390
+
1391
+ # # task_matches = find_tasks_for_stage(stage_id, item.task, top_k=3)
1392
+ # # if not task_matches:
1393
+ # # continue
1394
+
1395
+ # # best_task, task_confidence = task_matches[0]
1396
+
1397
+ # # material_matches = find_materials_for_task(
1398
+ # # best_task, item.material, item.unit, top_k=5
1399
+ # # )
1400
+
1401
+ # # validated_materials = [
1402
+ # # ValidatedMaterial(
1403
+ # # materialId=m['materialId'],
1404
+ # # name=m['name'],
1405
+ # # material=m['material'],
1406
+ # # unit=m['unit'] or 'unit',
1407
+ # # price=float(m['price']),
1408
+ # # margin=float(m['margin']),
1409
+ # # categories=m['categories'],
1410
+ # # confidence_score=round(score / 10.0, 2)
1411
+ # # )
1412
+ # # for m, score in material_matches
1413
+ # # ]
1414
+
1415
+ # # # FIX: Parse roomArea properly
1416
+ # # validated_task = ValidatedTask(
1417
+ # # taskId=best_task['taskId'],
1418
+ # # task=best_task['task'],
1419
+ # # displayName=best_task['displayName'],
1420
+ # # unit=best_task['unit'],
1421
+ # # stageId=best_task['stageId'],
1422
+ # # roomArea=parse_room_area(best_task['roomArea']), # <-- FIXED HERE
1423
+ # # confidence_score=round(task_confidence, 2),
1424
+ # # recommended_materials=validated_materials
1425
+ # # )
1426
+
1427
+ # # validated_stages_dict[stage_id]['tasks'].append(validated_task)
1428
+
1429
+ # # validated_stages = [
1430
+ # # ValidatedStage(
1431
+ # # stageId=stage_data['stage_data']['stageId'],
1432
+ # # stage=stage_data['stage_data']['stage'],
1433
+ # # priority=stage_data['stage_data']['priority'],
1434
+ # # confidence_score=round(stage_data['confidence'], 2),
1435
+ # # tasks=stage_data['tasks']
1436
+ # # )
1437
+ # # for stage_data in validated_stages_dict.values()
1438
+ # # ]
1439
+
1440
+ # # validated_stages.sort(key=lambda x: x.priority)
1441
+
1442
+ # # validated_area = ValidatedArea(
1443
+ # # roomId=matched_room['id'] if matched_room else None,
1444
+ # # name=matched_room['name'] if matched_room else area_scope.area,
1445
+ # # roomType=matched_room['roomType'] if matched_room else 'unknown',
1446
+ # # matched=matched_room is not None,
1447
+ # # confidence_score=round(room_confidence, 2),
1448
+ # # stages=validated_stages
1449
+ # # )
1450
+
1451
+ # # validated_areas.append(validated_area)
1452
+
1453
+ # # summary = {
1454
+ # # 'total_areas': len(validated_areas),
1455
+ # # 'total_stages': sum(len(a.stages) for a in validated_areas),
1456
+ # # 'total_tasks': sum(len(s.tasks) for a in validated_areas for s in a.stages),
1457
+ # # 'total_materials': sum(
1458
+ # # len(t.recommended_materials)
1459
+ # # for a in validated_areas
1460
+ # # for s in a.stages
1461
+ # # for t in s.tasks
1462
+ # # ),
1463
+ # # 'matched_areas': sum(1 for a in validated_areas if a.matched),
1464
+ # # 'avg_confidence': round(
1465
+ # # np.mean([a.confidence_score for a in validated_areas]), 2
1466
+ # # ) if validated_areas else 0.0
1467
+ # # }
1468
+
1469
+ # # return ValidatedResponse(areas=validated_areas, summary=summary)
1470
+
1471
+ # # # ============= API ENDPOINTS =============
1472
+ # # @app.get("/")
1473
+ # # async def root():
1474
+ # # return {
1475
+ # # "service": "Construction Scope Validator",
1476
+ # # "version": "1.0.0",
1477
+ # # "status": "running",
1478
+ # # "data_loaded": len(db.stages) > 0,
1479
+ # # "model_type": "trained" if os.path.exists('model.safetensors') else "base"
1480
+ # # }
1481
+
1482
+ # # @app.get("/health")
1483
+ # # async def health():
1484
+ # # return {
1485
+ # # "status": "healthy",
1486
+ # # "stages_loaded": len(db.stages),
1487
+ # # "tasks_loaded": len(db.tasks),
1488
+ # # "materials_loaded": len(db.materials),
1489
+ # # "rooms_loaded": len(db.rooms),
1490
+ # # "embeddings_ready": db.stage_embeddings is not None,
1491
+ # # "model_type": "trained" if os.path.exists('model.safetensors') else "base"
1492
+ # # }
1493
+
1494
+ # # @app.post("/validate", response_model=ValidatedResponse)
1495
+ # # async def validate_scope_endpoint(request: LLMScopeRequest):
1496
+ # # """
1497
+ # # Validate LLM-generated scope against database
1498
+ # # Returns enriched data with matched stages, tasks, materials, and confidence scores
1499
+ # # """
1500
+ # # try:
1501
+ # # if not db.stages:
1502
+ # # raise HTTPException(status_code=500, detail="Database not loaded")
1503
+ # # result = validate_scope(request)
1504
+ # # return result
1505
+ # # except Exception as e:
1506
+ # # import traceback
1507
+ # # error_detail = f"Validation error: {str(e)}\n{traceback.format_exc()}"
1508
+ # # raise HTTPException(status_code=500, detail=error_detail)
1509
+
1510
+ # # @app.post("/match-stage")
1511
+ # # async def match_stage(stage_name: str):
1512
+ # # """Test endpoint: match a single stage name"""
1513
+ # # matched_stage, confidence = find_best_stage(stage_name)
1514
+ # # if matched_stage:
1515
+ # # return {
1516
+ # # "input": stage_name,
1517
+ # # "matched": matched_stage,
1518
+ # # "confidence": round(confidence, 2)
1519
+ # # }
1520
+ # # return {"input": stage_name, "matched": None, "confidence": 0.0}
1521
+
1522
+ # # @app.post("/match-room")
1523
+ # # async def match_room(room_name: str):
1524
+ # # """Test endpoint: match a single room name"""
1525
+ # # matched_room, confidence = find_best_room(room_name)
1526
+ # # if matched_room:
1527
+ # # return {
1528
+ # # "input": room_name,
1529
+ # # "matched": matched_room,
1530
+ # # "confidence": round(confidence, 2)
1531
+ # # }
1532
+ # # return {"input": room_name, "matched": None, "confidence": 0.0}
1533
+
1534
+ # # # ============= STARTUP =============
1535
+ # # @app.on_event("startup")
1536
+ # # async def startup_event():
1537
+ # # """Load data and initialize embeddings on startup"""
1538
+ # # try:
1539
+ # # print("\n" + "="*60)
1540
+ # # print("STARTING UP...")
1541
+ # # print("="*60)
1542
+
1543
+ # # db.load_data(
1544
+ # # stages_file='stages.json',
1545
+ # # tasks_file='tasks.json',
1546
+ # # materials_file='materials.json',
1547
+ # # rooms_file='rooms.json'
1548
+ # # )
1549
+ # # db.initialize_embeddings()
1550
+
1551
+ # # print("\n" + "="*60)
1552
+ # # print("✅ SERVICE READY!")
1553
+ # # print("="*60)
1554
+ # # except Exception as e:
1555
+ # # print(f"\n❌ STARTUP ERROR: {e}")
1556
+ # # print("Make sure JSON files are in the correct location")
1557
+ # # import traceback
1558
+ # # traceback.print_exc()
1559
+
1560
+ # # if __name__ == "__main__":
1561
+ # # import uvicorn
1562
+ # # uvicorn.run(app, host="0.0.0.0", port=7860)
1563
+
1564
+ # # """
1565
+ # # FastAPI Service for Construction Scope Validation
1566
+ # # Deploy on Hugging Face Spaces
1567
+ # # """
1568
+
1569
+ # # from fastapi import FastAPI, HTTPException
1570
+ # # from fastapi.middleware.cors import CORSMiddleware
1571
+ # # from pydantic import BaseModel, Field
1572
+ # # from typing import List, Optional, Dict, Any
1573
+ # # import json
1574
+ # # import numpy as np
1575
+ # # import os
1576
+ # # from sentence_transformers import SentenceTransformer
1577
+ # # from sklearn.metrics.pairwise import cosine_similarity
1578
+ # # import re
1579
+
1580
+ # # app = FastAPI(
1581
+ # # title="Construction Scope Validator API",
1582
+ # # description="Validates and enriches LLM-generated construction scope with DB data",
1583
+ # # version="1.0.0"
1584
+ # # )
1585
+
1586
+ # # # CORS middleware
1587
+ # # app.add_middleware(
1588
+ # # CORSMiddleware,
1589
+ # # allow_origins=["*"],
1590
+ # # allow_credentials=True,
1591
+ # # allow_methods=["*"],
1592
+ # # allow_headers=["*"],
1593
+ # # )
1594
+
1595
+ # # # Load embedding model (cached globally)
1596
+ # # # Try to load trained model from root, fallback to base model
1597
+ # # print("="*60)
1598
+ # # print("LOADING MODEL...")
1599
+ # # print("="*60)
1600
+
1601
+ # # try:
1602
+ # # # Check if trained model files exist in root
1603
+ # # # Check if trained model files exist in root
1604
+ # # model_files = ['config.json', 'sentence_bert_config.json']
1605
+ # # # Check for either pytorch_model.bin or model.safetensors
1606
+ # # has_weights = os.path.exists('pytorch_model.bin') or os.path.exists('model.safetensors')
1607
+ # # has_model = all(os.path.exists(f) for f in model_files) and has_weights
1608
+
1609
+ # # if has_model:
1610
+ # # print("✓ Trained model files found in root directory")
1611
+ # # print("Loading trained model...")
1612
+ # # embedding_model = SentenceTransformer('./', device='cpu')
1613
+ # # print("✅ Trained model loaded successfully!")
1614
+ # # else:
1615
+ # # print("⚠️ Trained model not found, using base model...")
1616
+ # # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
1617
+ # # print("✅ Base model loaded successfully!")
1618
+ # # except Exception as e:
1619
+ # # print(f"❌ Error loading trained model: {e}")
1620
+ # # print("Falling back to base model...")
1621
+ # # embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
1622
+ # # print("✅ Base model loaded successfully!")
1623
+
1624
+ # # print("="*60)
1625
+
1626
+ # # # ============= DATA MODELS =============
1627
+
1628
+ # # class LLMScopeItem(BaseModel):
1629
+ # # stage: str
1630
+ # # task: str
1631
+ # # material: str
1632
+ # # quantity: float
1633
+ # # unit: str
1634
+
1635
+ # # class LLMAreaScope(BaseModel):
1636
+ # # area: str
1637
+ # # items: List[LLMScopeItem]
1638
+
1639
+ # # class LLMScopeRequest(BaseModel):
1640
+ # # scope_of_work: List[LLMAreaScope]
1641
+
1642
+ # # class ValidatedMaterial(BaseModel):
1643
+ # # materialId: int
1644
+ # # name: str
1645
+ # # material: str
1646
+ # # unit: str
1647
+ # # price: float
1648
+ # # margin: float
1649
+ # # categories: List[str]
1650
+ # # confidence_score: float
1651
+
1652
+ # # class ValidatedTask(BaseModel):
1653
+ # # taskId: int
1654
+ # # task: str
1655
+ # # displayName: str
1656
+ # # unit: str
1657
+ # # stageId: int
1658
+ # # roomArea: List[str]
1659
+ # # confidence_score: float
1660
+ # # recommended_materials: List[ValidatedMaterial]
1661
+
1662
+ # # class ValidatedStage(BaseModel):
1663
+ # # stageId: int
1664
+ # # stage: str
1665
+ # # priority: int
1666
+ # # confidence_score: float
1667
+ # # tasks: List[ValidatedTask]
1668
+
1669
+ # # class ValidatedArea(BaseModel):
1670
+ # # roomId: Optional[int]
1671
+ # # name: str
1672
+ # # roomType: str
1673
+ # # matched: bool
1674
+ # # confidence_score: float
1675
+ # # stages: List[ValidatedStage]
1676
+
1677
+ # # class ValidatedResponse(BaseModel):
1678
+ # # areas: List[ValidatedArea]
1679
+ # # summary: Dict[str, Any]
1680
+
1681
+ # # # ============= DATABASE LOADERS =============
1682
+
1683
+ # # class DatabaseLoader:
1684
+ # # def __init__(self):
1685
+ # # self.stages = []
1686
+ # # self.tasks = []
1687
+ # # self.materials = []
1688
+ # # self.rooms = []
1689
+ # # self.stage_embeddings = None
1690
+ # # self.task_embeddings = None
1691
+ # # self.material_embeddings = None
1692
+
1693
+ # # def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
1694
+ # # """Load JSON data files"""
1695
+ # # print(f"Loading {stages_file}...")
1696
+ # # with open(stages_file, 'r', encoding='utf-8') as f:
1697
+ # # self.stages = [json.loads(line) for line in f if line.strip()]
1698
+
1699
+ # # print(f"Loading {tasks_file}...")
1700
+ # # with open(tasks_file, 'r', encoding='utf-8') as f:
1701
+ # # self.tasks = [json.loads(line) for line in f if line.strip()]
1702
+
1703
+ # # print(f"Loading {materials_file}...")
1704
+ # # with open(materials_file, 'r', encoding='utf-8') as f:
1705
+ # # self.materials = [json.loads(line) for line in f if line.strip()]
1706
+
1707
+ # # print(f"Loading {rooms_file}...")
1708
+ # # with open(rooms_file, 'r', encoding='utf-8') as f:
1709
+ # # self.rooms = [json.loads(line) for line in f if line.strip()]
1710
+
1711
+ # # print(f"✅ Loaded: {len(self.stages)} stages, {len(self.tasks)} tasks, "
1712
+ # # f"{len(self.materials)} materials, {len(self.rooms)} rooms")
1713
+
1714
+ # # def initialize_embeddings(self):
1715
+ # # """Pre-compute embeddings for fast lookup"""
1716
+ # # print("Computing stage embeddings...")
1717
+ # # stage_texts = [s['stage'] for s in self.stages]
1718
+ # # self.stage_embeddings = embedding_model.encode(stage_texts, show_progress_bar=True)
1719
+
1720
+ # # print("Computing task embeddings...")
1721
+ # # task_texts = [t['task'] for t in self.tasks]
1722
+ # # self.task_embeddings = embedding_model.encode(task_texts, show_progress_bar=True)
1723
+
1724
+ # # print("Computing material embeddings...")
1725
+ # # material_texts = [m['material'] for m in self.materials]
1726
+ # # self.material_embeddings = embedding_model.encode(material_texts, show_progress_bar=True)
1727
+
1728
+ # # print("✅ Embeddings ready!")
1729
 
1730
+ # # # Global DB instance
1731
+ # # db = DatabaseLoader()
 
 
 
 
1732
 
1733
+ # # # ============= MATCHING FUNCTIONS =============
 
 
1734
 
1735
+ # # def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
1736
+ # # """Find closest matching stage from DB"""
1737
+ # # query_embedding = embedding_model.encode([llm_stage])
1738
+ # # similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
1739
+
1740
+ # # best_idx = np.argmax(similarities)
1741
+ # # best_score = similarities[best_idx]
1742
+
1743
+ # # if best_score >= threshold:
1744
+ # # return db.stages[best_idx], best_score
1745
+ # # return None, 0.0
1746
 
1747
+ # # def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
1748
+ # # """Find closest matching room from DB"""
1749
+ # # llm_area_lower = llm_area.lower()
1750
+
1751
+ # # # Exact match first
1752
+ # # for room in db.rooms:
1753
+ # # if room['name'].lower() == llm_area_lower:
1754
+ # # return room, 1.0
1755
+
1756
+ # # # Fuzzy match
1757
+ # # room_texts = [r['name'] for r in db.rooms]
1758
+ # # query_embedding = embedding_model.encode([llm_area])
1759
+ # # room_embeddings = embedding_model.encode(room_texts)
1760
+ # # similarities = cosine_similarity(query_embedding, room_embeddings)[0]
1761
+
1762
+ # # best_idx = np.argmax(similarities)
1763
+ # # best_score = similarities[best_idx]
1764
+
1765
+ # # if best_score >= threshold:
1766
+ # # return db.rooms[best_idx], best_score
1767
+ # # return None, 0.0
1768
+
1769
+ # # def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
1770
+ # # """Find relevant tasks for a stage matching LLM task description"""
1771
+ # # # Filter tasks by stage
1772
+ # # stage_tasks = [t for t in db.tasks if t['stageId'] == stage_id]
1773
+
1774
+ # # if not stage_tasks:
1775
+ # # return []
1776
+
1777
+ # # # Compute similarities
1778
+ # # task_indices = [db.tasks.index(t) for t in stage_tasks]
1779
+ # # query_embedding = embedding_model.encode([llm_task])
1780
+
1781
+ # # stage_task_embeddings = db.task_embeddings[task_indices]
1782
+ # # similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]
1783
+
1784
+ # # # Get top K
1785
+ # # top_indices = np.argsort(similarities)[-top_k:][::-1]
1786
+ # # results = [(stage_tasks[idx], similarities[idx]) for idx in top_indices]
1787
+
1788
+ # # return results
1789
+
1790
+ # # def extract_keywords(text: str) -> List[str]:
1791
+ # # """Extract meaningful keywords from text"""
1792
+ # # # Remove common words
1793
+ # # stop_words = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
1794
+ # # words = re.findall(r'\b\w+\b', text.lower())
1795
+ # # return [w for w in words if w not in stop_words and len(w) > 2]
1796
+
1797
+ # # def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int = 10) -> List[tuple]:
1798
+ # # """Find materials matching task requirements"""
1799
+ # # task_keywords = extract_keywords(task['task'])
1800
+ # # llm_keywords = extract_keywords(llm_material)
1801
+ # # all_keywords = set(task_keywords + llm_keywords)
1802
+
1803
+ # # # Filter by unit compatibility
1804
+ # # compatible_materials = [
1805
+ # # m for m in db.materials
1806
+ # # if m['unit'] == unit or m['unit'] == 'unit' or m['unit'] is None
1807
+ # # ]
1808
+
1809
+ # # if not compatible_materials:
1810
+ # # # Fallback: allow any unit
1811
+ # # compatible_materials = db.materials
1812
+
1813
+ # # # Score materials
1814
+ # # scored_materials = []
1815
+ # # for material in compatible_materials:
1816
+ # # score = 0.0
1817
+ # # material_text = material['material'].lower()
1818
+
1819
+ # # # Keyword matching
1820
+ # # for keyword in all_keywords:
1821
+ # # if keyword in material_text:
1822
+ # # score += 2.0
1823
+
1824
+ # # # Category matching
1825
+ # # categories_str = ' '.join(material.get('categories', [])).lower()
1826
+ # # for keyword in all_keywords:
1827
+ # # if keyword in categories_str:
1828
+ # # score += 1.0
1829
+
1830
+ # # # Embedding similarity
1831
+ # # material_idx = db.materials.index(material)
1832
+ # # query_embedding = embedding_model.encode([llm_material])
1833
+ # # material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
1834
+ # # semantic_score = cosine_similarity(query_embedding, material_embedding)[0][0]
1835
+ # # score += semantic_score * 5.0
1836
+
1837
+ # # if score > 0:
1838
+ # # scored_materials.append((material, score))
1839
+
1840
+ # # # Sort and return top K
1841
+ # # scored_materials.sort(key=lambda x: x[1], reverse=True)
1842
+ # # return scored_materials[:top_k]
1843
 
1844
+ # # # ============= VALIDATION PIPELINE =============
 
 
 
 
 
 
 
 
1845
 
1846
+ # # def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
1847
+ # # """Main validation pipeline"""
1848
+ # # validated_areas = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1849
 
1850
+ # # for area_scope in llm_scope.scope_of_work:
1851
+ # # # Match room/area
1852
+ # # matched_room, room_confidence = find_best_room(area_scope.area)
1853
 
1854
+ # # validated_stages_dict = {}
1855
 
1856
+ # # for item in area_scope.items:
1857
+ # # # Match stage
1858
+ # # matched_stage, stage_confidence = find_best_stage(item.stage)
1859
 
1860
+ # # if not matched_stage:
1861
+ # # continue # Skip if stage not found
1862
 
1863
+ # # stage_id = matched_stage['stageId']
1864
 
1865
+ # # # Initialize stage if new
1866
+ # # if stage_id not in validated_stages_dict:
1867
+ # # validated_stages_dict[stage_id] = {
1868
+ # # 'stage_data': matched_stage,
1869
+ # # 'confidence': stage_confidence,
1870
+ # # 'tasks': []
1871
+ # # }
1872
 
1873
+ # # # Match task
1874
+ # # task_matches = find_tasks_for_stage(stage_id, item.task, top_k=3)
1875
 
1876
+ # # if not task_matches:
1877
+ # # continue
1878
 
1879
+ # # best_task, task_confidence = task_matches[0]
1880
 
1881
+ # # # Match materials
1882
+ # # material_matches = find_materials_for_task(
1883
+ # # best_task,
1884
+ # # item.material,
1885
+ # # item.unit,
1886
+ # # top_k=5
1887
+ # # )
1888
 
1889
+ # # validated_materials = [
1890
+ # # ValidatedMaterial(
1891
+ # # materialId=m['materialId'],
1892
+ # # name=m['name'],
1893
+ # # material=m['material'],
1894
+ # # unit=m['unit'] or 'unit',
1895
+ # # price=float(m['price']),
1896
+ # # margin=float(m['margin']),
1897
+ # # categories=m['categories'],
1898
+ # # confidence_score=round(score / 10.0, 2)
1899
+ # # )
1900
+ # # for m, score in material_matches
1901
+ # # ]
1902
 
1903
+ # # validated_task = ValidatedTask(
1904
+ # # taskId=best_task['taskId'],
1905
+ # # task=best_task['task'],
1906
+ # # displayName=best_task['displayName'],
1907
+ # # unit=best_task['unit'],
1908
+ # # stageId=best_task['stageId'],
1909
+ # # roomArea=best_task['roomArea'],
1910
+ # # confidence_score=round(task_confidence, 2),
1911
+ # # recommended_materials=validated_materials
1912
+ # # )
1913
 
1914
+ # # validated_stages_dict[stage_id]['tasks'].append(validated_task)
1915
 
1916
+ # # # Build validated stages list
1917
+ # # validated_stages = [
1918
+ # # ValidatedStage(
1919
+ # # stageId=stage_data['stage_data']['stageId'],
1920
+ # # stage=stage_data['stage_data']['stage'],
1921
+ # # priority=stage_data['stage_data']['priority'],
1922
+ # # confidence_score=round(stage_data['confidence'], 2),
1923
+ # # tasks=stage_data['tasks']
1924
+ # # )
1925
+ # # for stage_data in validated_stages_dict.values()
1926
+ # # ]
1927
 
1928
+ # # # Sort stages by priority
1929
+ # # validated_stages.sort(key=lambda x: x.priority)
1930
 
1931
+ # # validated_area = ValidatedArea(
1932
+ # # roomId=matched_room['id'] if matched_room else None,
1933
+ # # name=matched_room['name'] if matched_room else area_scope.area,
1934
+ # # roomType=matched_room['roomType'] if matched_room else 'unknown',
1935
+ # # matched=matched_room is not None,
1936
+ # # confidence_score=round(room_confidence, 2),
1937
+ # # stages=validated_stages
1938
+ # # )
1939
 
1940
+ # # validated_areas.append(validated_area)
1941
 
1942
+ # # # Build summary
1943
+ # # summary = {
1944
+ # # 'total_areas': len(validated_areas),
1945
+ # # 'total_stages': sum(len(a.stages) for a in validated_areas),
1946
+ # # 'total_tasks': sum(len(s.tasks) for a in validated_areas for s in a.stages),
1947
+ # # 'total_materials': sum(
1948
+ # # len(t.recommended_materials)
1949
+ # # for a in validated_areas
1950
+ # # for s in a.stages
1951
+ # # for t in s.tasks
1952
+ # # ),
1953
+ # # 'matched_areas': sum(1 for a in validated_areas if a.matched),
1954
+ # # 'avg_confidence': round(
1955
+ # # np.mean([a.confidence_score for a in validated_areas]), 2
1956
+ # # ) if validated_areas else 0.0
1957
+ # # }
1958
 
1959
+ # # return ValidatedResponse(areas=validated_areas, summary=summary)
1960
+
1961
+ # # # ============= API ENDPOINTS =============
1962
+
1963
+ # # @app.get("/")
1964
+ # # async def root():
1965
+ # # return {
1966
+ # # "service": "Construction Scope Validator",
1967
+ # # "version": "1.0.0",
1968
+ # # "status": "running",
1969
+ # # "data_loaded": len(db.stages) > 0,
1970
+ # # "model_type": "trained" if os.path.exists('pytorch_model.bin') else "base"
1971
+ # # }
1972
+
1973
+ # # @app.get("/health")
1974
+ # # async def health():
1975
+ # # return {
1976
+ # # "status": "healthy",
1977
+ # # "stages_loaded": len(db.stages),
1978
+ # # "tasks_loaded": len(db.tasks),
1979
+ # # "materials_loaded": len(db.materials),
1980
+ # # "rooms_loaded": len(db.rooms),
1981
+ # # "embeddings_ready": db.stage_embeddings is not None,
1982
+ # # "model_type": "trained" if os.path.exists('pytorch_model.bin') else "base"
1983
+ # # }
1984
+
1985
+ # # @app.post("/validate", response_model=ValidatedResponse)
1986
+ # # async def validate_scope_endpoint(request: LLMScopeRequest):
1987
+ # # """
1988
+ # # Validate LLM-generated scope against database
1989
 
1990
+ # # Returns enriched data with:
1991
+ # # - Matched stages from DB
1992
+ # # - Matched tasks from DB
1993
+ # # - Recommended materials with pricing
1994
+ # # - Confidence scores for all matches
1995
+ # # """
1996
+ # # try:
1997
+ # # if not db.stages:
1998
+ # # raise HTTPException(status_code=500, detail="Database not loaded")
1999
 
2000
+ # # result = validate_scope(request)
2001
+ # # return result
2002
 
2003
+ # # except Exception as e:
2004
+ # # raise HTTPException(status_code=500, detail=f"Validation error: {str(e)}")
2005
+
2006
+ # # @app.post("/match-stage")
2007
+ # # async def match_stage(stage_name: str):
2008
+ # # """Test endpoint: match a single stage name"""
2009
+ # # matched_stage, confidence = find_best_stage(stage_name)
2010
+ # # if matched_stage:
2011
+ # # return {
2012
+ # # "input": stage_name,
2013
+ # # "matched": matched_stage,
2014
+ # # "confidence": round(confidence, 2)
2015
+ # # }
2016
+ # # return {"input": stage_name, "matched": None, "confidence": 0.0}
2017
+
2018
+ # # @app.post("/match-room")
2019
+ # # async def match_room(room_name: str):
2020
+ # # """Test endpoint: match a single room name"""
2021
+ # # matched_room, confidence = find_best_room(room_name)
2022
+ # # if matched_room:
2023
+ # # return {
2024
+ # # "input": room_name,
2025
+ # # "matched": matched_room,
2026
+ # # "confidence": round(confidence, 2)
2027
+ # # }
2028
+ # # return {"input": room_name, "matched": None, "confidence": 0.0}
2029
+
2030
+ # # # ============= STARTUP =============
2031
+
2032
+ # # @app.on_event("startup")
2033
+ # # async def startup_event():
2034
+ # # """Load data and initialize embeddings on startup"""
2035
+ # # try:
2036
+ # # print("\n" + "="*60)
2037
+ # # print("STARTING UP...")
2038
+ # # print("="*60)
2039
 
2040
+ # # # Check what files are available
2041
+ # # print("\nFiles in root directory:")
2042
+ # # for file in os.listdir('.'):
2043
+ # # print(f" - {file}")
2044
 
2045
+ # # # Load data
2046
+ # # db.load_data(
2047
+ # # stages_file='stages.json',
2048
+ # # tasks_file='tasks.json',
2049
+ # # materials_file='materials.json',
2050
+ # # rooms_file='rooms.json'
2051
+ # # )
2052
+ # # db.initialize_embeddings()
2053
 
2054
+ # # print("\n" + "="*60)
2055
+ # # print("✅ SERVICE READY!")
2056
+ # # print("="*60)
2057
+ # # except Exception as e:
2058
+ # # print(f"\n❌ STARTUP ERROR: {e}")
2059
+ # # print("Make sure JSON files are in the correct location")
2060
+ # # import traceback
2061
+ # # traceback.print_exc()
2062
+
2063
+ # # if __name__ == "__main__":
2064
+ # # import uvicorn
2065
+ # # uvicorn.run(app, host="0.0.0.0", port=7860)