Spaces:

mlbench123
/

Sudoco_ScopeOfWork

Sleeping

App Files Files Community

mlbench123 committed on Jan 2

Commit

2a631c5

verified ·

1 Parent(s): 299a54e

Create app.py

Browse files

Files changed (1) hide show

app.py +454 -0

app.py ADDED Viewed

	@@ -0,0 +1,454 @@

+"""
+FastAPI Service for Construction Scope Validation
+Deploy on Hugging Face Spaces
+"""
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict, Any
+import json
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import re
+# ASGI application object; the metadata below is passed straight to FastAPI.
+app = FastAPI(
+    title="Construction Scope Validator API",
+    description="Validates and enriches LLM-generated construction scope with DB data",
+    version="1.0.0"
+)
+# CORS middleware
+# NOTE(review): every origin, method and header is allowed while credentials are
+# also enabled. FastAPI's CORS documentation advises against combining wildcard
+# values with allow_credentials=True; confirm whether credentialed cross-origin
+# requests are actually required before keeping this combination.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Load embedding model (cached globally)
+# Instantiated once at import time; the matching functions below all use this
+# single module-level instance to encode text.
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+# ============= DATA MODELS =============
+class LLMScopeItem(BaseModel):
+    """One LLM-proposed scope line: free-text stage, task and material plus a quantity and unit."""
+    stage: str  # free-text stage name, matched against DB stages by validate_scope
+    task: str  # free-text task description, matched against tasks of the resolved stage
+    material: str  # free-text material description used to rank DB materials
+    quantity: float  # accepted in the request; not read by the code visible in this file
+    unit: str  # unit string used to filter DB materials by unit compatibility
+class LLMAreaScope(BaseModel):
+    """All LLM-proposed scope items for one named area."""
+    area: str  # free-text area name, matched against DB rooms by validate_scope
+    items: List[LLMScopeItem]  # scope lines belonging to this area
+class LLMScopeRequest(BaseModel):
+    """Request body of the /validate endpoint."""
+    scope_of_work: List[LLMAreaScope]  # one entry per area
+class ValidatedMaterial(BaseModel):
+    """A DB material recommended for a validated task, together with its match score."""
+    materialId: int
+    name: str
+    material: str
+    unit: str  # validate_scope substitutes 'unit' when the DB record has no unit
+    price: float
+    margin: float
+    categories: List[str]
+    confidence_score: float  # derived by validate_scope from the material ranking score
+class ValidatedTask(BaseModel):
+    """A DB task matched to one LLM scope item, with its recommended materials."""
+    taskId: int
+    task: str
+    displayName: str
+    unit: str
+    stageId: int
+    roomArea: List[str]
+    confidence_score: float  # similarity between the LLM task text and the DB task, rounded
+    recommended_materials: List[ValidatedMaterial]
+class ValidatedStage(BaseModel):
+    """A DB stage matched for an area, holding the tasks validated under it."""
+    stageId: int
+    stage: str
+    priority: int  # validate_scope sorts an area's stages by this value, ascending
+    confidence_score: float  # similarity between the LLM stage text and the DB stage, rounded
+    tasks: List[ValidatedTask]
+class ValidatedArea(BaseModel):
+    """Validation result for one requested area."""
+    roomId: Optional[int]  # None when no DB room was matched
+    name: str  # DB room name when matched, otherwise the area name from the request
+    roomType: str  # 'unknown' when no DB room was matched
+    matched: bool  # True when a DB room was found for the area
+    confidence_score: float  # room match score, rounded (0.0 when unmatched)
+    stages: List[ValidatedStage]
+class ValidatedResponse(BaseModel):
+    """Response body of the /validate endpoint."""
+    areas: List[ValidatedArea]
+    summary: Dict[str, Any]  # aggregate counts and average area confidence built by validate_scope
+# ============= DATABASE LOADERS =============
+class DatabaseLoader:
+    def __init__(self):
+        self.stages = []
+        self.tasks = []
+        self.materials = []
+        self.rooms = []
+        self.stage_embeddings = None
+        self.task_embeddings = None
+        self.material_embeddings = None
+    def load_data(self, stages_file: str, tasks_file: str, materials_file: str, rooms_file: str):
+        """Load JSON data files"""
+        with open(stages_file, 'r') as f:
+            self.stages = [json.loads(line) for line in f if line.strip()]
+        with open(tasks_file, 'r') as f:
+            self.tasks = [json.loads(line) for line in f if line.strip()]
+        with open(materials_file, 'r') as f:
+            self.materials = [json.loads(line) for line in f if line.strip()]
+        with open(rooms_file, 'r') as f:
+            self.rooms = [json.loads(line) for line in f if line.strip()]
+        print(f"Loaded: {len(self.stages)} stages, {len(self.tasks)} tasks, "
+              f"{len(self.materials)} materials, {len(self.rooms)} rooms")
+    def initialize_embeddings(self):
+        """Pre-compute embeddings for fast lookup"""
+        print("Computing stage embeddings...")
+        stage_texts = [s['stage'] for s in self.stages]
+        self.stage_embeddings = embedding_model.encode(stage_texts, show_progress_bar=True)
+        print("Computing task embeddings...")
+        task_texts = [t['task'] for t in self.tasks]
+        self.task_embeddings = embedding_model.encode(task_texts, show_progress_bar=True)
+        print("Computing material embeddings...")
+        material_texts = [m['material'] for m in self.materials]
+        self.material_embeddings = embedding_model.encode(material_texts, show_progress_bar=True)
+        print("Embeddings ready!")
+# Global DB instance
+db = DatabaseLoader()
+# ============= MATCHING FUNCTIONS =============
+def find_best_stage(llm_stage: str, threshold: float = 0.5) -> tuple:
+    """Find closest matching stage from DB"""
+    query_embedding = embedding_model.encode([llm_stage])
+    similarities = cosine_similarity(query_embedding, db.stage_embeddings)[0]
+    best_idx = np.argmax(similarities)
+    best_score = similarities[best_idx]
+    if best_score >= threshold:
+        return db.stages[best_idx], best_score
+    return None, 0.0
+def find_best_room(llm_area: str, threshold: float = 0.6) -> tuple:
+    """Find closest matching room from DB"""
+    llm_area_lower = llm_area.lower()
+    # Exact match first
+    for room in db.rooms:
+        if room['name'].lower() == llm_area_lower:
+            return room, 1.0
+    # Fuzzy match
+    room_texts = [r['name'] for r in db.rooms]
+    query_embedding = embedding_model.encode([llm_area])
+    room_embeddings = embedding_model.encode(room_texts)
+    similarities = cosine_similarity(query_embedding, room_embeddings)[0]
+    best_idx = np.argmax(similarities)
+    best_score = similarities[best_idx]
+    if best_score >= threshold:
+        return db.rooms[best_idx], best_score
+    return None, 0.0
+def find_tasks_for_stage(stage_id: int, llm_task: str, top_k: int = 5) -> List[tuple]:
+    """Find relevant tasks for a stage matching LLM task description"""
+    # Filter tasks by stage
+    stage_tasks = [t for t in db.tasks if t['stageId'] == stage_id]
+    if not stage_tasks:
+        return []
+    # Compute similarities
+    task_indices = [db.tasks.index(t) for t in stage_tasks]
+    query_embedding = embedding_model.encode([llm_task])
+    stage_task_embeddings = db.task_embeddings[task_indices]
+    similarities = cosine_similarity(query_embedding, stage_task_embeddings)[0]
+    # Get top K
+    top_indices = np.argsort(similarities)[-top_k:][::-1]
+    results = [(stage_tasks[idx], similarities[idx]) for idx in top_indices]
+    return results
+def extract_keywords(text: str) -> List[str]:
+    """Extract meaningful keywords from text"""
+    # Remove common words
+    stop_words = {'and', 'or', 'the', 'to', 'a', 'of', 'for', 'in', 'on', 'supply', 'install'}
+    words = re.findall(r'\b\w+\b', text.lower())
+    return [w for w in words if w not in stop_words and len(w) > 2]
+def find_materials_for_task(task: dict, llm_material: str, unit: str, top_k: int = 10) -> List[tuple]:
+    """Find materials matching task requirements"""
+    task_keywords = extract_keywords(task['task'])
+    llm_keywords = extract_keywords(llm_material)
+    all_keywords = set(task_keywords + llm_keywords)
+    # Filter by unit compatibility
+    compatible_materials = [
+        m for m in db.materials
+        if m['unit'] == unit or m['unit'] == 'unit' or m['unit'] is None
+    ]
+    if not compatible_materials:
+        # Fallback: allow any unit
+        compatible_materials = db.materials
+    # Score materials
+    scored_materials = []
+    for material in compatible_materials:
+        score = 0.0
+        material_text = material['material'].lower()
+        # Keyword matching
+        for keyword in all_keywords:
+            if keyword in material_text:
+                score += 2.0
+        # Category matching
+        categories_str = ' '.join(material.get('categories', [])).lower()
+        for keyword in all_keywords:
+            if keyword in categories_str:
+                score += 1.0
+        # Embedding similarity
+        material_idx = db.materials.index(material)
+        query_embedding = embedding_model.encode([llm_material])
+        material_embedding = db.material_embeddings[material_idx].reshape(1, -1)
+        semantic_score = cosine_similarity(query_embedding, material_embedding)[0][0]
+        score += semantic_score * 5.0
+        if score > 0:
+            scored_materials.append((material, score))
+    # Sort and return top K
+    scored_materials.sort(key=lambda x: x[1], reverse=True)
+    return scored_materials[:top_k]
+# ============= VALIDATION PIPELINE =============
+def validate_scope(llm_scope: LLMScopeRequest) -> ValidatedResponse:
+    """Main validation pipeline"""
+    validated_areas = []
+    for area_scope in llm_scope.scope_of_work:
+        # Match room/area
+        matched_room, room_confidence = find_best_room(area_scope.area)
+        validated_stages_dict = {}
+        for item in area_scope.items:
+            # Match stage
+            matched_stage, stage_confidence = find_best_stage(item.stage)
+            if not matched_stage:
+                continue  # Skip if stage not found
+            stage_id = matched_stage['stageId']
+            # Initialize stage if new
+            if stage_id not in validated_stages_dict:
+                validated_stages_dict[stage_id] = {
+                    'stage_data': matched_stage,
+                    'confidence': stage_confidence,
+                    'tasks': []
+                }
+            # Match task
+            task_matches = find_tasks_for_stage(stage_id, item.task, top_k=3)
+            if not task_matches:
+                continue
+            best_task, task_confidence = task_matches[0]
+            # Match materials
+            material_matches = find_materials_for_task(
+                best_task,
+                item.material,
+                item.unit,
+                top_k=5
+            )
+            validated_materials = [
+                ValidatedMaterial(
+                    materialId=m['materialId'],
+                    name=m['name'],
+                    material=m['material'],
+                    unit=m['unit'] or 'unit',
+                    price=float(m['price']),
+                    margin=float(m['margin']),
+                    categories=m['categories'],
+                    confidence_score=round(score / 10.0, 2)
+                )
+                for m, score in material_matches
+            ]
+            validated_task = ValidatedTask(
+                taskId=best_task['taskId'],
+                task=best_task['task'],
+                displayName=best_task['displayName'],
+                unit=best_task['unit'],
+                stageId=best_task['stageId'],
+                roomArea=best_task['roomArea'],
+                confidence_score=round(task_confidence, 2),
+                recommended_materials=validated_materials
+            )
+            validated_stages_dict[stage_id]['tasks'].append(validated_task)
+        # Build validated stages list
+        validated_stages = [
+            ValidatedStage(
+                stageId=stage_data['stage_data']['stageId'],
+                stage=stage_data['stage_data']['stage'],
+                priority=stage_data['stage_data']['priority'],
+                confidence_score=round(stage_data['confidence'], 2),
+                tasks=stage_data['tasks']
+            )
+            for stage_data in validated_stages_dict.values()
+        ]
+        # Sort stages by priority
+        validated_stages.sort(key=lambda x: x.priority)
+        validated_area = ValidatedArea(
+            roomId=matched_room['id'] if matched_room else None,
+            name=matched_room['name'] if matched_room else area_scope.area,
+            roomType=matched_room['roomType'] if matched_room else 'unknown',
+            matched=matched_room is not None,
+            confidence_score=round(room_confidence, 2),
+            stages=validated_stages
+        )
+        validated_areas.append(validated_area)
+    # Build summary
+    summary = {
+        'total_areas': len(validated_areas),
+        'total_stages': sum(len(a.stages) for a in validated_areas),
+        'total_tasks': sum(len(s.tasks) for a in validated_areas for s in a.stages),
+        'total_materials': sum(
+            len(t.recommended_materials)
+            for a in validated_areas
+            for s in a.stages
+            for t in s.tasks
+        ),
+        'matched_areas': sum(1 for a in validated_areas if a.matched),
+        'avg_confidence': round(
+            np.mean([a.confidence_score for a in validated_areas]), 2
+        ) if validated_areas else 0.0
+    }
+    return ValidatedResponse(areas=validated_areas, summary=summary)
+# ============= API ENDPOINTS =============
+@app.get("/")
+async def root():
+    return {
+        "service": "Construction Scope Validator",
+        "version": "1.0.0",
+        "status": "running",
+        "data_loaded": len(db.stages) > 0
+    }
+@app.get("/health")
+async def health():
+    return {
+        "status": "healthy",
+        "stages_loaded": len(db.stages),
+        "tasks_loaded": len(db.tasks),
+        "materials_loaded": len(db.materials),
+        "rooms_loaded": len(db.rooms),
+        "embeddings_ready": db.stage_embeddings is not None
+    }
+@app.post("/validate", response_model=ValidatedResponse)
+async def validate_scope_endpoint(request: LLMScopeRequest):
+    """
+    Validate LLM-generated scope against database
+    Returns enriched data with:
+    - Matched stages from DB
+    - Matched tasks from DB
+    - Recommended materials with pricing
+    - Confidence scores for all matches
+    """
+    try:
+        if not db.stages:
+            raise HTTPException(status_code=500, detail="Database not loaded")
+        result = validate_scope(request)
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Validation error: {str(e)}")
+@app.post("/match-stage")
+async def match_stage(stage_name: str):
+    """Test endpoint: match a single stage name"""
+    matched_stage, confidence = find_best_stage(stage_name)
+    if matched_stage:
+        return {
+            "input": stage_name,
+            "matched": matched_stage,
+            "confidence": round(confidence, 2)
+        }
+    return {"input": stage_name, "matched": None, "confidence": 0.0}
+@app.post("/match-room")
+async def match_room(room_name: str):
+    """Test endpoint: match a single room name"""
+    matched_room, confidence = find_best_room(room_name)
+    if matched_room:
+        return {
+            "input": room_name,
+            "matched": matched_room,
+            "confidence": round(confidence, 2)
+        }
+    return {"input": room_name, "matched": None, "confidence": 0.0}
+# ============= STARTUP =============
+@app.on_event("startup")
+async def startup_event():
+    """Load data and initialize embeddings on startup"""
+    try:
+        # In production, load from mounted volumes or environment
+        # For Hugging Face Spaces, put JSON files in the repo root
+        db.load_data(
+            stages_file='stages.json',
+            tasks_file='tasks.json',
+            materials_file='materials.json',
+            rooms_file='rooms.json'
+        )
+        db.initialize_embeddings()
+        print("✅ Service ready!")
+    except Exception as e:
+        print(f"❌ Startup error: {e}")
+        print("Make sure JSON files are in the correct location")
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)