Spaces:

lablab-ai-amd-developer-hackathon
/

ForgeSight

Sleeping

App Files Files Community

rasAli02 committed 17 days ago

Commit

307f1c5

1 Parent(s): a0c4b2d

git add, commit, push

Browse files

Files changed (5) hide show

backend/agents.py +87 -40
backend/amd_hackathon/fine-tune.yaml +6 -6
backend/amd_hackathon/generate_dataset.py +16 -16
backend/amd_hackathon/track3_vision.py +116 -0
backend/amd_hackathon/verify_env.py +58 -0

backend/agents.py CHANGED Viewed

@@ -14,6 +14,14 @@ from typing import Optional, List, Dict, Any
 EMERGENT_LLM_KEY = os.environ.get("EMERGENT_LLM_KEY", "")
 # Model choices — Claude Sonnet 4.5 is vision-capable and strong for reasoning.
 VISION_MODEL = ("anthropic", "claude-sonnet-4-5-20250929")
 TEXT_MODEL = ("anthropic", "claude-sonnet-4-5-20250929")
@@ -91,6 +99,66 @@ def _extract_json(raw: str) -> Dict[str, Any]:
     return {"_raw": raw}
 async def _run_agent(
     name: str,
     system_message: str,
@@ -98,47 +166,26 @@ async def _run_agent(
     image_base64: Optional[str] = None,
     provider_model: tuple = TEXT_MODEL,
 ) -> Dict[str, Any]:
-    # MOCKED for preview
     import asyncio
-    await asyncio.sleep(0.5)
-    parsed = {}
-    if name == "inspector":
-        parsed = {
-            "verdict": "warn",
-            "confidence": 0.85,
-            "defects": [
-                {"type": "surface-scratch", "severity": "low", "location": "top-left edge", "description": "Minor scratch visible"}
-            ],
-            "observation": "I can see a minor scratch on the surface."
-        }
-    elif name == "diagnostician":
-        parsed = {
-            "probable_cause": "Improper handling during milling.",
-            "contributing_factors": ["Machine calibration", "Operator error"],
-            "affected_process_step": "CNC milling"
-        }
-    elif name == "action":
-        parsed = {
-            "priority": "P2",
-            "assignee_role": "quality-engineer",
-            "steps": ["Inspect machine", "Recalibrate"],
-            "estimated_minutes": 30,
-            "parts_or_tools": ["Calibration kit"]
-        }
-    elif name == "reporter":
-        parsed = {
-            "headline": "Minor Scratch Detected",
-            "summary": "A minor scratch was detected during the milling process.",
-            "tags": ["scratch", "milling"]
-        }
-    elif name == "social":
-        parsed = {
-            "x_post": "Testing our pipeline #AMDHackathon",
-            "linkedin_post": "We are testing our pipeline today..."
-        }
-    return {"raw": json.dumps(parsed), "parsed": parsed}
 async def run_pipeline(

 EMERGENT_LLM_KEY = os.environ.get("EMERGENT_LLM_KEY", "")
+# AMD MI300X inference server (fine-tuned compliance model)
+# Jupyter proxy route used since direct port 8000 is firewalled.
+# Override with AMD_INFERENCE_URL env var if direct access is available.
+AMD_INFERENCE_URL = os.environ.get(
+    "AMD_INFERENCE_URL",
+    "http://129.212.191.163/proxy/8000"
+)
 # Model choices — Claude Sonnet 4.5 is vision-capable and strong for reasoning.
 VISION_MODEL = ("anthropic", "claude-sonnet-4-5-20250929")
 TEXT_MODEL = ("anthropic", "claude-sonnet-4-5-20250929")
     return {"_raw": raw}
def _build_prompt(system_message: str, user_text: str) -> str:
    """Render a single-turn prompt in ChatML for the fine-tuned model.

    The fine-tuning config in this project trains with
    ``chat_template: chatml``, so the inference prompt uses the same
    ``<|im_start|>role ... <|im_end|>`` framing instead of ad-hoc
    ``<|system|>``/``<|user|>`` markers the model never saw during training.

    Args:
        system_message: Instruction text placed in the ``system`` turn.
        user_text: Content of the ``user`` turn.

    Returns:
        The prompt string, ending with an opened ``assistant`` turn so the
        model continues with its reply.
    """
    return (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{user_text}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
def _mock_response(name: str) -> Dict[str, Any]:
    """Return the canned result for agent *name* (used when the AMD server is offline).

    The result has three keys: ``"raw"`` (the payload serialized as JSON),
    ``"parsed"`` (the payload dict itself) and ``"source"`` (always
    ``"mock"``). An unrecognized agent name yields an empty payload.
    """
    inspector_payload = {
        "verdict": "warn",
        "confidence": 0.85,
        "defects": [
            {
                "type": "surface-scratch",
                "severity": "low",
                "location": "top-left edge",
                "description": "Minor scratch visible",
            }
        ],
        "observation": "Minor scratch detected on surface. [LOCAL MOCK — AMD server offline]",
    }
    diagnostician_payload = {
        "probable_cause": "Improper handling during milling. [LOCAL MOCK]",
        "contributing_factors": ["Machine calibration", "Operator error"],
        "affected_process_step": "CNC milling",
    }
    action_payload = {
        "priority": "P2",
        "assignee_role": "quality-engineer",
        "steps": ["Inspect machine", "Recalibrate"],
        "estimated_minutes": 30,
        "parts_or_tools": ["Calibration kit"],
    }
    reporter_payload = {
        "headline": "Minor Scratch Detected [Mock]",
        "summary": "Local mock response — start the AMD inference server to use the fine-tuned compliance model.",
        "tags": ["scratch", "mock", "local"],
    }
    social_payload = {
        "x_post": "Testing our pipeline #AMDHackathon",
        "linkedin_post": "We are testing our pipeline today...",
    }

    # One entry per pipeline agent; anything else falls through to {}.
    by_agent = dict(
        inspector=inspector_payload,
        diagnostician=diagnostician_payload,
        action=action_payload,
        reporter=reporter_payload,
        social=social_payload,
    )
    payload = by_agent[name] if name in by_agent else {}
    return {"raw": json.dumps(payload), "parsed": payload, "source": "mock"}
async def _call_amd_server(prompt: str) -> Optional[str]:
    """Send *prompt* to the AMD MI300X inference server and return its completion.

    POSTs ``{"prompt": prompt, "max_tokens": 512}`` as JSON to
    ``{AMD_INFERENCE_URL}/v1/complete`` and reads the ``"text"`` field of the
    JSON reply. The blocking urllib call runs in the default executor so the
    event loop is not stalled while waiting (10 s timeout).

    Args:
        prompt: Fully rendered prompt string.

    Returns:
        The reply's ``"text"`` value (``""`` when the field is absent), or
        ``None`` when the server cannot be reached or the reply is not a
        JSON object — the caller then falls back to mock output.
    """
    import asyncio
    import http.client
    import logging
    import urllib.error
    import urllib.request

    log = logging.getLogger(__name__)

    # Tolerate a trailing slash in the configured base URL.
    endpoint = f"{AMD_INFERENCE_URL.rstrip('/')}/v1/complete"
    request = urllib.request.Request(
        endpoint,
        data=json.dumps({"prompt": prompt, "max_tokens": 512}).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    def _do_request() -> Any:
        with urllib.request.urlopen(request, timeout=10) as resp:
            return json.loads(resp.read())

    try:
        # get_running_loop() is the supported way to reach the loop from
        # inside a coroutine; get_event_loop() is deprecated for this use.
        reply = await asyncio.get_running_loop().run_in_executor(None, _do_request)
    except (
        urllib.error.URLError,      # connection refused, DNS failure, HTTP 4xx/5xx
        http.client.HTTPException,  # malformed or truncated HTTP response
        OSError,                    # socket-level errors, including timeouts
        ValueError,                 # invalid JSON or undecodable body
    ) as exc:
        # Best-effort call: report why it failed instead of hiding it.
        log.warning("AMD inference server unavailable at %s: %s", endpoint, exc)
        return None

    if not isinstance(reply, dict):
        log.warning("Unexpected reply type from %s: %s", endpoint, type(reply).__name__)
        return None
    return reply.get("text", "")
 async def _run_agent(
     name: str,
     system_message: str,
     image_base64: Optional[str] = None,
     provider_model: tuple = TEXT_MODEL,
 ) -> Dict[str, Any]:
+    """
+    Run an agent. Tries the AMD MI300X fine-tuned model first.
+    Falls back to mock responses automatically if the server is not running
+    (e.g. local development without the AMD instance active).
+    """
     import asyncio
+    await asyncio.sleep(0.1)
+    prompt = _build_prompt(system_message, user_text)
+    raw_text = await _call_amd_server(prompt)
+    if raw_text is None:
+        # AMD server not reachable — use local mock (safe for dev)
+        result = _mock_response(name)
+        result["source"] = "mock (AMD server offline)"
+        return result
+    # AMD server responded — parse its JSON output
+    parsed = _extract_json(raw_text)
+    return {"raw": raw_text, "parsed": parsed, "source": f"AMD MI300X @ {AMD_INFERENCE_URL}"}
 async def run_pipeline(

backend/amd_hackathon/fine-tune.yaml CHANGED Viewed

@@ -1,25 +1,25 @@
-base_model: Qwen/Qwen2.5-32B-Instruct
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-# Target Domain: Technical Engineering Specs (Track 2 Focus)
 load_in_4bit: true
 strict: false
 datasets:
-  - path: data/engineering_specs_synthetic.jsonl
     type: chat_template
     chat_template: chatml
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
-output_dir: ./qwen2.5-32b-engineering-lora
 # QLoRA Adapter Configuration
 adapter: qlora
 lora_model_dir:
-sequence_len: 8192 # Increased sequence length for complex specs
 sample_packing: true
 eval_sample_packing: false
 pad_to_sequence_len: true
@@ -30,7 +30,7 @@ lora_dropout: 0.05
 lora_target_linear: true
 lora_fan_in_fan_out:
-wandb_project: rocm-qwen32b-engineering
 wandb_entity:
 wandb_watch:
 wandb_name: mi300x-run-1

+base_model: Qwen/Qwen3.6-35B
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
+# Target Domain: Infrastructure Compliance
 load_in_4bit: true
 strict: false
 datasets:
+  - path: data/infrastructure_compliance.jsonl
     type: chat_template
     chat_template: chatml
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
+output_dir: ./qwen3.6-35b-compliance-lora
 # QLoRA Adapter Configuration
 adapter: qlora
 lora_model_dir:
+sequence_len: 32768 # 32k context window for long technical documents
 sample_packing: true
 eval_sample_packing: false
 pad_to_sequence_len: true
 lora_target_linear: true
 lora_fan_in_fan_out:
+wandb_project: rocm-qwen35b-compliance
 wandb_entity:
 wandb_watch:
 wandb_name: mi300x-run-1

backend/amd_hackathon/generate_dataset.py CHANGED Viewed

@@ -5,35 +5,35 @@ from pathlib import Path
 # We will save the data to the 'data' directory relative to where the script is run
 OUTPUT_DIR = Path("data")
-OUTPUT_FILE = OUTPUT_DIR / "engineering_specs_synthetic.jsonl"
-# Sample synthetic data templates focused on Technical Engineering Specs
-SYSTEM_PROMPT = "You are an expert industrial diagnostic AI. Given an engineering defect report or visual observation, you must reference technical specifications and provide a highly accurate diagnostic verdict, severity, and remediation steps."
 SYNTHETIC_SAMPLES = [
     {
-        "user": "I am looking at a steel beam (Grade A36) with visible surface micro-fractures near the weld joint. The load bearing requirement is 50 kN.",
-        "assistant": "According to structural engineering standard AWS D1.1, micro-fractures near a weld joint on ASTM A36 steel indicate potential hydrogen-induced cracking or rapid cooling stress. Given the 50 kN load requirement, this poses a critical structural risk. \n\n**Verdict**: FAIL\n**Severity**: CRITICAL\n**Remediation**: Immediate halt of load bearing. Recommend ultrasonic testing (UT) to determine crack depth and subsequent re-welding using low-hydrogen electrodes."
     },
     {
-        "user": "The printed circuit board (PCB) shows solder bridging between pins 4 and 5 of the main microcontroller (STM32).",
-        "assistant": "Solder bridging between pins 4 and 5 constitutes a short circuit, violating IPC-A-610 Class 3 standards for high-reliability electronic assemblies. \n\n**Verdict**: FAIL\n**Severity**: HIGH\n**Remediation**: Rework required. Apply flux and use a desoldering braid or hot air rework station to remove the excess solder. Inspect adjacent pins for thermal damage."
     },
     {
-        "user": "Visual inspection of the hydraulic turbine blade shows a 2mm pitting corrosion spot on the leading edge. Material is Titanium alloy.",
-        "assistant": "Pitting corrosion of 2mm on the leading edge of a Titanium alloy hydraulic turbine blade exceeds the operational tolerance outlined in ISO 11366. It disrupts hydrodynamic efficiency and can lead to cavitation erosion over time.\n\n**Verdict**: WARN\n**Severity**: MEDIUM\n**Remediation**: Flag for scheduled maintenance. The pitting should be ground smooth and re-coated with an anti-cavitation protective layer during the next service cycle."
     },
     {
-        "user": "Engine gasket seal appears slightly discolored but maintains full contact and no pressure leak detected during the 100psi test.",
-        "assistant": "Discoloration of the gasket without pressure loss at 100psi is within acceptable limits for standard thermal wear per manufacturer baseline specs. No functional impairment is detected.\n\n**Verdict**: PASS\n**Severity**: LOW\n**Remediation**: No immediate action required. Continue standard monitoring."
     }
 ]
 def generate_dataset(num_samples: int = 100):
     """
-    Generates a synthetic JSONL dataset formatted in ChatML.
-    In a real scenario, you could use an LLM API (e.g. GPT-4 or Claude 3.5 Sonnet)
-    in a loop here to generate thousands of diverse examples.
     """
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
@@ -41,8 +41,8 @@ def generate_dataset(num_samples: int = 100):
     with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
         for _ in range(num_samples):
-            # For demonstration, we just randomly sample from our templates
-            # A real generator would use an LLM to generate variations
             sample = random.choice(SYNTHETIC_SAMPLES)
             chatml_format = {

 # We will save the data to the 'data' directory relative to where the script is run
 OUTPUT_DIR = Path("data")
+OUTPUT_FILE = OUTPUT_DIR / "infrastructure_compliance.jsonl"
+# Synthetic data templates focused on Infrastructure Standards (Eurocodes, ASTM, ISO 9001)
+# and 'Violation Reasoning'
+SYSTEM_PROMPT = "You are an expert infrastructure compliance auditor. Given a technical description of a construction element, compare it against relevant engineering standards (Eurocodes, ASTM Standards, ISO 9001) to identify any violations. You must output the label 'Compliant' or 'Non-Compliant' followed by a detailed reasoning trace."
 SYNTHETIC_SAMPLES = [
     {
+        "user": "A bridge pier is constructed using concrete with a compressive strength of 25 MPa. It is exposed to a severe marine environment (XS3 exposure class).",
+        "assistant": "**Label**: Non-Compliant\n\n**Reasoning Trace**: According to Eurocode 2 (EN 1992-1-1), structures in exposure class XS3 (tidal, splash, and spray zones in marine environments) require a minimum concrete strength class of C35/45 to ensure adequate durability and resistance to chloride-induced corrosion. The provided compressive strength of 25 MPa (C25/30) is insufficient for this exposure class, violating the structural code requirements."
     },
     {
+        "user": "The quality management system documentation includes a calibration record for the torque wrenches used on the primary structural steel connections. The calibration was performed 6 months ago, and the required interval is 12 months.",
+        "assistant": "**Label**: Compliant\n\n**Reasoning Trace**: Under ISO 9001:2015 Clause 7.1.5 (Monitoring and measuring resources), organizations must retain appropriate documented information as evidence of fitness for purpose of monitoring and measurement resources. The torque wrenches are calibrated within the required 12-month interval (performed 6 months ago), satisfying the quality management standard for infrastructure construction."
     },
     {
+        "user": "Structural steel beams are connected using high-strength structural bolts (ASTM A325). The bolts are installed using the snug-tightened method for a slip-critical connection.",
+        "assistant": "**Label**: Non-Compliant\n\n**Reasoning Trace**: In accordance with the AISC Specification and ASTM standards for structural joints using high-strength bolts, slip-critical connections require bolts to be fully tensioned (e.g., using turn-of-nut, calibrated wrench, or direct tension indicators). The snug-tightened method is only permitted for bearing-type connections where slip is acceptable. Using snug-tightened bolts for a slip-critical connection violates the installation standard and poses a risk of joint failure under service loads."
     },
     {
+        "user": "Reinforcing steel (rebar) for the foundation mat is specified as ASTM A615 Grade 60. The rebar is delivered with mill test reports confirming a minimum yield strength of 60,000 psi.",
+        "assistant": "**Label**: Compliant\n\n**Reasoning Trace**: ASTM A615 Grade 60 requires deformed carbon-steel bars for concrete reinforcement to have a minimum yield strength of 60,000 psi. The mill test reports confirm that the delivered material meets this mechanical property requirement, ensuring it complies with the specified ASTM standard for the foundation mat."
     }
 ]
 def generate_dataset(num_samples: int = 100):
     """
+    Generates a synthetic JSONL dataset formatted in ChatML for Axolotl fine-tuning.
+    Focuses on 'Violation Reasoning' for Infrastructure Compliance.
     """
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
     with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
         for _ in range(num_samples):
+            # For demonstration, we randomly sample from our templates.
+            # In production, an LLM pipeline could generate varied scenarios.
             sample = random.choice(SYNTHETIC_SAMPLES)
             chatml_format = {

backend/amd_hackathon/track3_vision.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import torch
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from PIL import Image
def analyze_construction_site(image_path: str, device: str = "cuda") -> str:
    """Describe a construction-site image with Qwen2-VL and wrap it for Track 2.

    Loads ``Qwen/Qwen2-VL-7B-Instruct``, asks it for a technical description
    of the image at *image_path*, and passes the generated text through
    ``_format_for_track2`` so it can serve as context for the compliance
    model.

    Args:
        image_path: Path of the image file to analyze.
        device: Device passed as ``device_map`` when loading the model and
            used as the target for the input tensors.

    Returns:
        The formatted inspection report. If the model or processor cannot be
        loaded, the mocked report from ``_mocked_vision_output`` is returned
        instead (the image is not read in that case).

    Raises:
        ValueError: If the image cannot be opened or converted to RGB.
    """
    model_id = "Qwen/Qwen2-VL-7B-Instruct"
    print(f"Loading {model_id} on {device}...")
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map=device,
        )
        processor = AutoProcessor.from_pretrained(model_id)
    except Exception as e:
        print(f"Model loading failed (this is expected if weights aren't downloaded): {e}")
        # Return a mocked structured output for demonstration purposes in the hackathon
        return _mocked_vision_output()

    # Load the image; the context manager closes the file handle once the
    # pixel data has been copied by convert().
    try:
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
    except Exception as e:
        raise ValueError(f"Could not load image at {image_path}: {e}") from e

    # Prompt tailored for technical extraction
    prompt = (
        "You are an expert construction site inspector. Describe the structural elements, "
        "materials, and construction practices visible in this image. Focus on technical "
        "details like concrete pouring, rebar placement, structural steel connections, "
        "and any visible environmental exposure factors. Be highly descriptive and objective."
    )

    # Qwen2-VL chat format: one user turn holding the image and the text
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Hand the raw PIL image to the processor. It runs the image processor
    # itself, so pre-processing the image first and passing the resulting
    # features as `images` would feed it the wrong input type.
    inputs = processor(
        text=[text],
        images=[image],
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)

    print("Analyzing image...")
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)

    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return _format_for_track2(output_text)
def _mocked_vision_output() -> str:
    """Return a canned, Track 2-formatted report for runs without the VLM weights."""
    sentences = (
        "A bridge pier is constructed using concrete. Reinforcement bars are visible with approximately ",
        "50mm of concrete cover. The pier is located directly in a tidal splash zone (marine environment). ",
        "Concrete surface appears to have minor honeycombing at the base.",
    )
    return _format_for_track2("".join(sentences))
def _format_for_track2(vision_text: str) -> str:
    """Wrap a vision description in the report/task layout given to the compliance model.

    The result is a ``### VISUAL INSPECTION REPORT (TRACK 3)`` section that
    contains *vision_text*, followed by a ``### TASK`` section asking for a
    'Compliant' / 'Non-Compliant' label with a reasoning trace.
    """
    report_section = "### VISUAL INSPECTION REPORT (TRACK 3)\n" + f"{vision_text}\n\n"
    task_section = "".join(
        [
            "### TASK\n",
            "Based on the visual inspection report above, identify any violations of structural codes ",
            "(e.g., Eurocodes, ASTM, ISO 9001). Provide a label of 'Compliant' or 'Non-Compliant' ",
            "followed by a detailed reasoning trace.",
        ]
    )
    return report_section + task_section
if __name__ == "__main__":
    # Smoke test: run the pipeline on a sample path and print the Track 2 context.
    sample_path = "dummy_construction_site.jpg"
    print(f"Testing Multimodal Pipeline with {sample_path}")
    try:
        track2_context = analyze_construction_site(sample_path)
        for chunk in (
            "\n--- Structured Output for Track 2 Model ---\n",
            track2_context,
            "\n-------------------------------------------\n",
        ):
            print(chunk)
    except Exception as err:
        print(f"Error: {err}")

backend/amd_hackathon/verify_env.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import os
+import sys
+import torch
def verify_environment():
    """Print a readiness report for the MI300X ROCm training environment.

    Reports the PyTorch version and each visible GPU (name and total VRAM),
    then probes for DeepSpeed, Flash Attention 2 and Axolotl. Exits the
    process with status 1 when ``torch.cuda.is_available()`` is false; a
    missing optional package is only reported.
    """
    rule = "========================================"
    print(rule)
    print("  MI300X ROCm Environment Verification  ")
    print(rule)

    # [1] PyTorch and GPU visibility
    print(f"\n[1] PyTorch Version: {torch.__version__}")
    if not torch.cuda.is_available():
        print("❌ CUDA/HIP is not available. Please check your ROCm installation.")
        sys.exit(1)
    print("✅ PyTorch is installed with CUDA/HIP support.")

    gpu_total = torch.cuda.device_count()
    print(f"    Available GPUs: {gpu_total}")
    for idx in range(gpu_total):
        gpu_name = torch.cuda.get_device_name(idx)
        print(f"    GPU {idx}: {gpu_name}")
        # total_memory is in bytes; report it in GiB
        vram_gib = torch.cuda.get_device_properties(idx).total_memory / (1024 ** 3)
        print(f"    VRAM GPU {idx}: {vram_gib:.2f} GB")
        if "MI300" in gpu_name:
            print("    ✅ MI300X detected.")

    # [2] DeepSpeed
    print("\n[2] Checking DeepSpeed...")
    try:
        import deepspeed
    except ImportError:
        print("❌ DeepSpeed is not installed.")
    else:
        print(f"✅ DeepSpeed Version: {deepspeed.__version__}")

    # [3] Flash Attention 2
    print("\n[3] Checking Flash Attention 2 (ROCm)...")
    try:
        import flash_attn
    except ImportError:
        print("❌ Flash Attention 2 is not installed or not configured for ROCm.")
    else:
        print(f"✅ Flash Attention 2 Version: {flash_attn.__version__}")

    # [4] Axolotl (import probe only; no version is reported)
    print("\n[4] Checking Axolotl...")
    try:
        import axolotl  # noqa: F401
    except ImportError:
        print("❌ Axolotl is not installed.")
    else:
        print("✅ Axolotl is installed.")

    print("\n========================================")
    print("          Verification Complete           ")
    print("========================================")


if __name__ == "__main__":
    verify_environment()