mlbench123 committed on
Commit
2a631c5
·
verified ·
1 Parent(s): 299a54e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +454 -0
app.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI Service for Construction Scope Validation
3
+ Deploy on Hugging Face Spaces
4
+ """
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel, Field
9
+ from typing import List, Optional, Dict, Any
10
+ import json
11
+ import numpy as np
12
+ from sentence_transformers import SentenceTransformer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ import re
15
+
16
# Application instance together with its title / description / version metadata.
app = FastAPI(
    version="1.0.0",
    title="Construction Scope Validator API",
    description="Validates and enriches LLM-generated construction scope with DB data",
)

# CORS middleware: every origin, method and header is accepted, credentials on.
# NOTE(review): wildcard origins combined with allow_credentials=True is a very
# permissive policy -- confirm it is intended before adding authenticated routes.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Sentence-embedding model, instantiated once at import time and shared by all
# matching helpers in this module.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
33
+
34
+ # ============= DATA MODELS =============
35
+
36
class LLMScopeItem(BaseModel):
    """One line of LLM-generated scope: a task, its stage and the material needed."""

    # Free-text stage name; matched against the stage records by find_best_stage.
    stage: str
    # Free-text task description; matched against the tasks of the matched stage.
    task: str
    # Free-text material description used to rank candidate materials.
    material: str
    # Amount of material in `unit`.
    quantity: float
    # Unit label; used to pre-filter materials by their own unit.
    unit: str
42
+
43
class LLMAreaScope(BaseModel):
    """All scope items the LLM produced for a single area / room."""

    # Free-text area name; matched against room names by find_best_room.
    area: str
    # Scope lines belonging to this area.
    items: List[LLMScopeItem]
46
+
47
class LLMScopeRequest(BaseModel):
    """Request body of POST /validate: the LLM scope grouped by area."""

    scope_of_work: List[LLMAreaScope]
49
+
50
class ValidatedMaterial(BaseModel):
    """A material record recommended for a validated task."""

    materialId: int
    name: str
    material: str
    # Falls back to 'unit' in validate_scope when the record's unit is empty.
    unit: str
    price: float
    margin: float
    categories: List[str]
    # Ranking score from find_materials_for_task divided by 10 (see validate_scope).
    confidence_score: float
59
+
60
class ValidatedTask(BaseModel):
    """A task record matched to an LLM scope item, with its recommended materials."""

    taskId: int
    task: str
    displayName: str
    unit: str
    stageId: int
    roomArea: List[str]
    # Similarity between the LLM task text and this task, rounded to 2 decimals.
    confidence_score: float
    recommended_materials: List[ValidatedMaterial]
69
+
70
class ValidatedStage(BaseModel):
    """A matched stage record and the validated tasks grouped under it."""

    stageId: int
    stage: str
    # Sort key: validate_scope orders the stages of an area by ascending priority.
    priority: int
    confidence_score: float
    tasks: List[ValidatedTask]
76
+
77
class ValidatedArea(BaseModel):
    """Validation result for one requested area."""

    # Id of the matched room record; None when no room matched.
    roomId: Optional[int]
    # Matched room name, or the original LLM area text when nothing matched.
    name: str
    # Room type of the matched room; 'unknown' when nothing matched.
    roomType: str
    matched: bool
    confidence_score: float
    stages: List[ValidatedStage]
84
+
85
class ValidatedResponse(BaseModel):
    """Response body of POST /validate."""

    areas: List[ValidatedArea]
    # Aggregate counters and the average area confidence (built in validate_scope).
    summary: Dict[str, Any]
88
+
89
+ # ============= DATABASE LOADERS =============
90
+
91
class DatabaseLoader:
    """In-memory store for the reference records and their text embeddings.

    ``stages``, ``tasks``, ``materials`` and ``rooms`` are lists of dict records
    parsed from JSON. The ``*_embeddings`` attributes stay ``None`` until
    :meth:`initialize_embeddings` has been called.
    """

    def __init__(self):
        self.stages = []
        self.tasks = []
        self.materials = []
        self.rooms = []
        self.stage_embeddings = None
        self.task_embeddings = None
        self.material_embeddings = None

    @staticmethod
    def _read_records(path: str) -> list:
        """Read records from ``path`` as a JSON document or as JSON Lines.

        The file is decoded as UTF-8 regardless of the platform default.
        A file holding one JSON document is accepted first: a top-level array
        yields its elements, any other value is wrapped in a one-element list.
        Otherwise each non-blank line is parsed as its own JSON value, which is
        the format the loader has always accepted.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if a line is not valid JSON.
        """
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            # Not a single document: treat as JSON Lines. Split on '\n' only,
            # because JSON strings may legally contain other line separators.
            return [json.loads(line) for line in content.split('\n') if line.strip()]

        return parsed if isinstance(parsed, list) else [parsed]

    def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
        """Load JSON data files"""
        self.stages = self._read_records(stages_file)
        self.tasks = self._read_records(tasks_file)
        self.materials = self._read_records(materials_file)
        self.rooms = self._read_records(rooms_file)

        print(f"Loaded: {len(self.stages)} stages, {len(self.tasks)} tasks, "
              f"{len(self.materials)} materials, {len(self.rooms)} rooms")

    def initialize_embeddings(self):
        """Pre-compute embeddings for fast lookup"""
        # Row i of each matrix corresponds to record i of the matching list;
        # the matching helpers rely on that alignment.
        print("Computing stage embeddings...")
        stage_texts = [s['stage'] for s in self.stages]
        self.stage_embeddings = embedding_model.encode(stage_texts, show_progress_bar=True)

        print("Computing task embeddings...")
        task_texts = [t['task'] for t in self.tasks]
        self.task_embeddings = embedding_model.encode(task_texts, show_progress_bar=True)

        print("Computing material embeddings...")
        material_texts = [m['material'] for m in self.materials]
        self.material_embeddings = embedding_model.encode(material_texts, show_progress_bar=True)

        print("Embeddings ready!")


# Global DB instance
db = DatabaseLoader()
136
+
137
+ # ============= MATCHING FUNCTIONS =============
138
+
139
def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
    """Find closest matching stage from DB

    Args:
        llm_stage: Stage text to look up.
        threshold: Minimum cosine similarity for a match to be accepted.

    Returns:
        ``(stage_record, score)`` for the most similar stage when its score is
        at least ``threshold``; ``(None, 0.0)`` otherwise, including when no
        stage data or embeddings are loaded. ``score`` is a built-in float.
    """
    # Nothing to compare against until stage data and embeddings are loaded.
    if not db.stages or db.stage_embeddings is None:
        return None, 0.0

    query_embedding = embedding_model.encode([llm_stage])
    similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]

    best_idx = int(np.argmax(similarities))
    # Convert the NumPy scalar so callers can put the score in a JSON response.
    best_score = float(similarities[best_idx])

    if best_score >= threshold:
        return db.stages[best_idx], best_score
    return None, 0.0
150
+
151
def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
    """Find closest matching room from DB

    A case-insensitive exact name match wins with score 1.0. Otherwise the room
    whose name embedding is most similar to ``llm_area`` is returned, provided
    the cosine similarity reaches ``threshold``.

    Returns:
        ``(room_record, score)`` on success, ``(None, 0.0)`` when nothing
        matches or no rooms are loaded. ``score`` is a built-in float.
    """
    if not db.rooms:
        return None, 0.0

    llm_area_lower = llm_area.lower()

    # Exact match first
    for room in db.rooms:
        if room['name'].lower() == llm_area_lower:
            return room, 1.0

    # Fuzzy match. Room-name embeddings are cached on the db object, keyed by
    # the list of names, so they are only recomputed when the room list changes
    # instead of on every request.
    room_texts = [r['name'] for r in db.rooms]
    cache = getattr(db, '_room_embedding_cache', None)
    if cache is None or cache[0] != room_texts:
        cache = (room_texts, embedding_model.encode(room_texts))
        db._room_embedding_cache = cache
    room_embeddings = cache[1]

    query_embedding = embedding_model.encode([llm_area])
    similarities = cosine_similarity(query_embedding, room_embeddings)[0]

    best_idx = int(np.argmax(similarities))
    # Convert the NumPy scalar so callers can put the score in a JSON response.
    best_score = float(similarities[best_idx])

    if best_score >= threshold:
        return db.rooms[best_idx], best_score
    return None, 0.0
172
+
173
def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
    """Find relevant tasks for a stage matching LLM task description

    Args:
        stage_id: Only tasks whose ``stageId`` equals this value are considered.
        llm_task: Task text to compare against the candidate tasks.
        top_k: Maximum number of results; values <= 0 yield an empty list.

    Returns:
        Up to ``top_k`` ``(task_record, score)`` tuples, ordered by descending
        cosine similarity. Scores are built-in floats.
    """
    # Keep each task's position while filtering so the matching embedding rows
    # can be selected directly. This avoids a list.index() scan per task, which
    # would also pick the wrong row for duplicate records.
    indexed_tasks = [(i, t) for i, t in enumerate(db.tasks) if t['stageId'] == stage_id]

    # top_k <= 0 must not reach the slice below: [-0:] would select everything.
    if not indexed_tasks or top_k <= 0:
        return []

    task_indices = [i for i, _ in indexed_tasks]
    stage_tasks = [t for _, t in indexed_tasks]

    # Compute similarities
    query_embedding = embedding_model.encode([llm_task])
    stage_task_embeddings = db.task_embeddings[task_indices]
    similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]

    # Get top K: argsort is ascending, so take the tail and reverse it.
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [(stage_tasks[idx], float(similarities[idx])) for idx in top_indices]
193
+
194
def extract_keywords(text: str) -> List[str]:
    """Return the lower-cased words of ``text`` that are worth matching on.

    Words of one or two characters and a small set of filler words (including
    'supply' and 'install') are dropped; order and duplicates are preserved.
    """
    ignored = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}

    keywords = []
    for token in re.findall(r'\b\w+\b', text.lower()):
        if len(token) <= 2 or token in ignored:
            continue
        keywords.append(token)
    return keywords
200
+
201
def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int = 10) -> List[tuple]:
    """Find materials matching task requirements

    Each candidate material is scored as:
      * +2.0 per keyword found in the material text,
      * +1.0 per keyword found in its categories,
      * +5.0 x cosine similarity between ``llm_material`` and the material.
    Keywords come from both the task text and ``llm_material``.

    Args:
        task: Task record; only ``task['task']`` is read.
        llm_material: Material text to rank against.
        unit: Requested unit. Materials with the same unit, the generic
            ``'unit'`` or no unit are candidates; if none qualify, every
            material is considered.
        top_k: Maximum number of results.

    Returns:
        Up to ``top_k`` ``(material_record, score)`` tuples with a positive
        score, best first. Scores are built-in floats.
    """
    all_keywords = set(extract_keywords(task['task'])) | set(extract_keywords(llm_material))

    # Filter by unit compatibility, remembering each material's row index so
    # its pre-computed embedding can be selected without a list.index() scan.
    candidates = [
        (i, m) for i, m in enumerate(db.materials)
        if m['unit'] == unit or m['unit'] == 'unit' or m['unit'] is None
    ]

    if not candidates:
        # Fallback: allow any unit
        candidates = list(enumerate(db.materials))

    if not candidates:
        return []

    # The query embedding does not depend on the candidate: encode it once and
    # compare it against all candidate rows in a single call.
    query_embedding = embedding_model.encode([llm_material])
    candidate_embeddings = db.material_embeddings[[i for i, _ in candidates]]
    semantic_scores = cosine_similarity(query_embedding, candidate_embeddings)[0]

    # Score materials
    scored_materials = []
    for (_, material), semantic_score in zip(candidates, semantic_scores):
        score = 0.0
        material_text = material['material'].lower()
        # `or []` tolerates records whose categories are missing or null.
        categories_str = ' '.join(material.get('categories') or []).lower()

        for keyword in all_keywords:
            # Keyword matching
            if keyword in material_text:
                score += 2.0
            # Category matching
            if keyword in categories_str:
                score += 1.0

        # Embedding similarity
        score += float(semantic_score) * 5.0

        if score > 0:
            scored_materials.append((material, score))

    # Sort and return top K
    scored_materials.sort(key=lambda x: x[1], reverse=True)
    return scored_materials[:top_k]
247
+
248
+ # ============= VALIDATION PIPELINE =============
249
+
250
def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
    """Match every requested area, stage, task and material against the DB.

    For each area the room is matched first. Each scope item is then matched
    to a stage, to the best task of that stage, and to up to five materials.
    Items whose stage or task cannot be matched are skipped. Tasks are grouped
    per stage and stages are ordered by priority. A summary with totals and
    the average area confidence accompanies the result.
    """
    areas_out = []

    for area_scope in llm_scope.scope_of_work:
        room, room_score = find_best_room(area_scope.area)

        # stageId -> {'stage_data', 'confidence', 'tasks'}; the first item that
        # hits a stage decides the confidence stored for it.
        stage_buckets = {}

        for item in area_scope.items:
            stage_row, stage_score = find_best_stage(item.stage)
            if not stage_row:
                continue  # Skip if stage not found

            stage_id = stage_row['stageId']
            bucket = stage_buckets.setdefault(
                stage_id,
                {
                    'stage_data': stage_row,
                    'confidence': stage_score,
                    'tasks': [],
                },
            )

            task_candidates = find_tasks_for_stage(stage_id, item.task, top_k=3)
            if not task_candidates:
                continue
            task_row, task_score = task_candidates[0]

            ranked_materials = find_materials_for_task(
                task_row,
                item.material,
                item.unit,
                top_k=5,
            )

            materials_out = []
            for mat, mat_score in ranked_materials:
                materials_out.append(
                    ValidatedMaterial(
                        materialId=mat['materialId'],
                        name=mat['name'],
                        material=mat['material'],
                        unit=mat['unit'] or 'unit',
                        price=float(mat['price']),
                        margin=float(mat['margin']),
                        categories=mat['categories'],
                        confidence_score=round(mat_score / 10.0, 2),
                    )
                )

            bucket['tasks'].append(
                ValidatedTask(
                    taskId=task_row['taskId'],
                    task=task_row['task'],
                    displayName=task_row['displayName'],
                    unit=task_row['unit'],
                    stageId=task_row['stageId'],
                    roomArea=task_row['roomArea'],
                    confidence_score=round(task_score, 2),
                    recommended_materials=materials_out,
                )
            )

        # One ValidatedStage per bucket, ordered by ascending priority.
        stages_out = sorted(
            (
                ValidatedStage(
                    stageId=bucket['stage_data']['stageId'],
                    stage=bucket['stage_data']['stage'],
                    priority=bucket['stage_data']['priority'],
                    confidence_score=round(bucket['confidence'], 2),
                    tasks=bucket['tasks'],
                )
                for bucket in stage_buckets.values()
            ),
            key=lambda stage: stage.priority,
        )

        if room:
            room_id, room_name, room_type = room['id'], room['name'], room['roomType']
        else:
            # Unmatched area: echo the requested name back to the caller.
            room_id, room_name, room_type = None, area_scope.area, 'unknown'

        areas_out.append(
            ValidatedArea(
                roomId=room_id,
                name=room_name,
                roomType=room_type,
                matched=room is not None,
                confidence_score=round(room_score, 2),
                stages=stages_out,
            )
        )

    # Build summary
    stage_total = task_total = material_total = matched_total = 0
    for area in areas_out:
        if area.matched:
            matched_total += 1
        stage_total += len(area.stages)
        for stage in area.stages:
            task_total += len(stage.tasks)
            for task in stage.tasks:
                material_total += len(task.recommended_materials)

    if areas_out:
        avg_confidence = round(np.mean([area.confidence_score for area in areas_out]), 2)
    else:
        avg_confidence = 0.0

    summary = {
        'total_areas': len(areas_out),
        'total_stages': stage_total,
        'total_tasks': task_total,
        'total_materials': material_total,
        'matched_areas': matched_total,
        'avg_confidence': avg_confidence,
    }

    return ValidatedResponse(areas=areas_out, summary=summary)
364
+
365
+ # ============= API ENDPOINTS =============
366
+
367
@app.get("/")
async def root():
    # Service banner; `data_loaded` is true once at least one stage is loaded.
    has_data = len(db.stages) > 0
    banner = {
        "service": "Construction Scope Validator",
        "version": "1.0.0",
        "status": "running",
        "data_loaded": has_data,
    }
    return banner
375
+
376
@app.get("/health")
async def health():
    # Record counts per collection plus a flag showing whether the stage
    # embeddings have been computed.
    report = {"status": "healthy"}
    report["stages_loaded"] = len(db.stages)
    report["tasks_loaded"] = len(db.tasks)
    report["materials_loaded"] = len(db.materials)
    report["rooms_loaded"] = len(db.rooms)
    report["embeddings_ready"] = db.stage_embeddings is not None
    return report
386
+
387
@app.post("/validate", response_model=ValidatedResponse)
async def validate_scope_endpoint(request: LLMScopeRequest):
    """
    Validate LLM-generated scope against database

    Returns enriched data with:
    - Matched stages from DB
    - Matched tasks from DB
    - Recommended materials with pricing
    - Confidence scores for all matches
    """
    # Checked outside the try block so this error reaches the client as-is
    # instead of being re-wrapped as a "Validation error".
    if not db.stages:
        raise HTTPException(status_code=500, detail="Database not loaded")

    try:
        return validate_scope(request)
    except HTTPException:
        # Already a well-formed HTTP error: pass it through unchanged.
        raise
    except Exception as e:
        # Top-level boundary: report any other failure as a 500 and keep the
        # original exception chained for the server logs.
        raise HTTPException(status_code=500, detail=f"Validation error: {str(e)}") from e
407
+
408
@app.post("/match-stage")
async def match_stage(stage_name: str):
    """Test endpoint: match a single stage name"""
    # Same guard and message as /validate: without stage data and embeddings
    # the matcher has nothing to compare against.
    if not db.stages or db.stage_embeddings is None:
        raise HTTPException(status_code=500, detail="Database not loaded")

    matched_stage, confidence = find_best_stage(stage_name)
    if matched_stage:
        return {
            "input": stage_name,
            "matched": matched_stage,
            # float(): the similarity may be a NumPy float32, which the JSON
            # encoder rejects when returned in a plain dict.
            "confidence": round(float(confidence), 2)
        }
    return {"input": stage_name, "matched": None, "confidence": 0.0}
419
+
420
@app.post("/match-room")
async def match_room(room_name: str):
    """Test endpoint: match a single room name"""
    # Same guard and message as /validate: without room data there is nothing
    # to match against.
    if not db.rooms:
        raise HTTPException(status_code=500, detail="Database not loaded")

    matched_room, confidence = find_best_room(room_name)
    if matched_room:
        return {
            "input": room_name,
            "matched": matched_room,
            # float(): the similarity may be a NumPy float32, which the JSON
            # encoder rejects when returned in a plain dict.
            "confidence": round(float(confidence), 2)
        }
    return {"input": room_name, "matched": None, "confidence": 0.0}
431
+
432
+ # ============= STARTUP =============
433
+
434
@app.on_event("startup")
async def startup_event():
    """Load data and initialize embeddings on startup"""
    # In production, load from mounted volumes or environment.
    # For Hugging Face Spaces, put JSON files in the repo root.
    data_files = {
        'stages_file': 'stages.json',
        'tasks_file': 'tasks.json',
        'materials_file': 'materials.json',
        'rooms_file': 'rooms.json',
    }

    try:
        db.load_data(**data_files)
        db.initialize_embeddings()
    except Exception as e:
        # Best effort: the app still starts, and / and /health report that no
        # data is loaded.
        print(f"❌ Startup error: {e}")
        print("Make sure JSON files are in the correct location")
    else:
        print("✅ Service ready!")
451
+
452
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7860.
    from uvicorn import run as _serve

    _serve(app, host="0.0.0.0", port=7860)