Bc-AI committed on
Commit
8777866
·
verified ·
1 Parent(s): 186936c

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. Dockerfile +23 -0
  2. README.md +12 -10
  3. app.py +691 -0
  4. requirements.txt +12 -0
  5. space-config.yaml +12 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.10-slim

# Unbuffered stdout/stderr so container logs appear immediately (the app
# logs via print()); skip .pyc generation to keep layers lean.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

WORKDIR /app

# Install system dependencies (compilers for packages that build from source)
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .
COPY shared ./shared

# Expose port for the API (matches uvicorn.run port in app.py)
EXPOSE 7860

# Start the application
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,12 @@
1
- ---
2
- title: Head
3
- emoji: 🏆
4
- colorFrom: yellow
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
+ # SACCP Head Node
2
+ This is a head node in the SACCP (Scalable Accelerated Compute Protocol) distributed computing network.
3
+
4
+ ## Node Type: HEAD
5
+ - Processes tasks according to SACCP protocol
6
+ - Contributes computational resources to the network
7
+ - Earns cloud credits for resource contribution
8
+
9
+ ## Architecture
10
+ - Built with FastAPI (model inference is delegated to worker nodes)
11
+ - Implements fault-tolerant operations
12
+ - Integrated with SACCP credit system
app.py ADDED
@@ -0,0 +1,691 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import json
4
+ import requests
5
+ import asyncio
6
+ import random
7
+ from datetime import datetime
8
+ from typing import Dict, List, Optional
9
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
10
+ from fastapi.responses import StreamingResponse
11
+ import uvicorn
12
+ from pydantic import BaseModel
13
+ from shared.models import ChatRequest, ChatResponse, ChatMessage, WorkerStatus, NodeType
14
+ from shared.node_types import NodeRegistrationRequest, NodeRegistrationResponse, NodeListResponse, NodeStatus, ServiceOffering, ServiceRequest
15
+ from shared.approval_system import smilyai_approval_system, ApprovalType
16
+ from shared.credits_system import credits_system, CreditReason, TransactionType
17
+ from shared.fault_tolerance import fault_tolerance_manager, FailureType, RecoveryStrategy
18
+ from shared.load_balancer import load_balancer, Task, TaskPriority
19
+ from shared.chat_history import save_detailed_chat_log, initialize_chat_file
20
+
app = FastAPI(
    title="Multi-Node Hugging Face API Gateway",
    description="API Gateway that routes requests to specialized worker nodes",
    version="1.0.0"
)

# Initialize chat history file
initialize_chat_file()

# Configuration - in production, these would come from environment variables
# Maps model name -> base URL of the worker serving that model.  Each entry
# can be overridden via the corresponding *_WORKER_URL environment variable.
WORKER_NODES = {
    "sam-x-nano": os.getenv("NANO_WORKER_URL", "http://nano-worker:8000"),
    "sam-x-mini": os.getenv("MINI_WORKER_URL", "http://mini-worker:8000"),
    "sam-x-fast": os.getenv("FAST_WORKER_URL", "http://fast-worker:8000"),
    "sam-x-large": os.getenv("LARGE_WORKER_URL", "http://large-worker:8000"),
    "sam-large-2": os.getenv("SAM2_WORKER_URL", "http://sam2-worker:8000"),  # Added Sam 2 support
    "universal": os.getenv("UNIVERSAL_WORKER_URL", "http://universal-worker:8000"),  # Universal worker that supports all models
}

# In-memory worker status tracking (in production, use Redis or database)
# model name -> {"active": bool, "last_check": epoch seconds, "load": float}
worker_status = {}
42
+
@app.on_event('startup')
def startup_event():
    """Mark every configured worker as active when the gateway boots.

    Fix: the original iterated `WORKER_NODES.items()` but never used the URL
    value, and called time.time() per iteration; iterate keys and timestamp once.
    """
    print("Starting Multi-Node Hugging Face API Gateway...")
    # Initialize worker status
    now = time.time()
    for model in WORKER_NODES:
        worker_status[model] = {"active": True, "last_check": now, "load": 0.0}
49
+
50
+
def route_to_worker(chat_request: ChatRequest) -> Dict:
    """Forward a chat request to the worker node serving its model.

    Returns the worker's JSON payload for non-streaming requests, or a
    {"streaming": True, "response": <requests.Response>} wrapper when the
    caller asked for streaming.  Raises HTTPException 400 for an unknown
    model, 503 when the worker is unreachable, 500 otherwise.
    """
    model = chat_request.model.lower()

    if model not in WORKER_NODES:
        # Fuzzy fallback: first configured model whose name overlaps the request.
        candidate = next((m for m in WORKER_NODES if model in m or m in model), None)
        if candidate is None:
            raise HTTPException(status_code=400, detail=f"Model {chat_request.model} not available")
        model = candidate

    worker_url = WORKER_NODES[model]

    try:
        worker_resp = requests.post(
            f"{worker_url}/chat/completions",
            json=chat_request.dict(),
            timeout=300,  # 5 minute timeout for long inference
            stream=chat_request.stream,  # pass the stream flag through
        )
        worker_resp.raise_for_status()
        if chat_request.stream:
            # Hand the live response object back for the streaming handler.
            return {"streaming": True, "response": worker_resp}
        return worker_resp.json()
    except requests.exceptions.RequestException as e:
        print(f"Error contacting worker {worker_url}: {str(e)}")
        # Mark the worker as down so /health and /models reflect it.
        worker_status[model] = {"active": False, "last_check": time.time(), "load": 0.0}
        raise HTTPException(status_code=503, detail=f"Worker for model {model} is not available")
    except Exception as e:
        print(f"Unexpected error contacting worker {worker_url}: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal server error")
90
+
91
+
def route_streaming_request(chat_request: ChatRequest):
    """Proxy a streaming chat request, relaying the worker's event stream.

    Fixes over the original:
    - removed the redundant function-local ``import requests`` (already
      imported at module level);
    - added ``raise_for_status`` and connection-error handling so a dead
      worker yields a clean 503 instead of an unhandled exception.
    """
    model = chat_request.model.lower()

    # Check if model is supported
    if model not in WORKER_NODES:
        # Find closest matching model
        available_models = [m for m in WORKER_NODES.keys() if model in m or m in model]
        if available_models:
            model = available_models[0]  # Use first available match
        else:
            raise HTTPException(status_code=400, detail=f"Model {chat_request.model} not available")

    worker_url = WORKER_NODES[model]

    try:
        # Stream request to worker
        worker_response = requests.post(
            f"{worker_url}/chat/completions",
            json=chat_request.dict(),
            timeout=300,  # 5 minute timeout for long inference
            stream=True
        )
        worker_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error contacting worker {worker_url}: {str(e)}")
        raise HTTPException(status_code=503, detail=f"Worker for model {model} is not available")

    # Forward the stream line by line as server-sent events
    def generate():
        for chunk in worker_response.iter_lines():
            if chunk:
                yield chunk.decode('utf-8') + "\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
126
+
127
+
@app.post("/chat/completions", response_model=ChatResponse)
async def chat_completions(request: ChatRequest, background_tasks: BackgroundTasks):
    """Main chat endpoint: dispatch to the right worker and log the exchange.

    NOTE(review): for streaming requests this returns a StreamingResponse
    even though the route declares response_model=ChatResponse — confirm
    FastAPI tolerates this in the deployed version.
    """
    start_time = time.time()

    try:
        if request.stream:
            # Streamed replies are proxied straight from the worker.
            return route_streaming_request(request)

        # Non-streaming: synchronous round trip to the worker.
        worker_response = route_to_worker(request)
        elapsed = time.time() - start_time

        # Pull the assistant text out of the OpenAI-style payload, if present.
        response_content = ""
        if "choices" in worker_response and len(worker_response["choices"]) > 0:
            first_choice = worker_response["choices"][0]
            response_content = first_choice.get("message", {}).get("content", "")

        # Persist the conversation without blocking the response.
        background_tasks.add_task(
            save_detailed_chat_log,
            request.dict(),
            response_content,
            request.model,
            elapsed,
        )

        return worker_response

    except HTTPException:
        # Let FastAPI serialize routing errors (400/503/500) untouched.
        raise
    except Exception as e:
        print(f"Error in chat_completions: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal server error")
169
+
170
+
@app.get("/models")
async def list_models():
    """OpenAI-style model listing, filtered to workers currently marked active."""
    catalog = []
    for model in WORKER_NODES:
        # Unknown status defaults to active (same policy as /health's inverse).
        if worker_status.get(model, {}).get("active", True):
            catalog.append({
                "id": model,
                "object": "model",
                "created": int(time.time()),
                "owned_by": "multinode-hf-api",
            })
    return {"object": "list", "data": catalog}
191
+
192
+
@app.get("/health")
async def health_check():
    """Gateway liveness: healthy iff at least one worker is marked active."""
    alive = [name for name, state in worker_status.items() if state.get("active", False)]
    return {
        "status": "healthy" if alive else "no_active_workers",
        "active_workers": alive,
        "total_workers": len(WORKER_NODES),
    }
206
+
207
+
@app.get("/worker-status")
async def get_worker_status():
    """
    Get detailed status of all workers
    """
    # Returns the raw in-memory map: model name -> {"active", "last_check", "load"}.
    return worker_status
214
+
215
+
@app.post("/chat")
async def simple_chat(message: str, model: str = "sam-x-nano", max_tokens: int = 512):
    """Convenience endpoint: one user message in, plain assistant text out."""
    wrapped = ChatRequest(
        messages=[ChatMessage(role="user", content=message)],
        model=model,
        max_tokens=max_tokens,
    )

    result = route_to_worker(wrapped)

    if "choices" in result and len(result["choices"]) > 0:
        return {"response": result["choices"][0]["message"]["content"]}
    raise HTTPException(status_code=500, detail="No response from worker")
233
+
234
+
# Available services in the marketplace
# Static catalog returned by /marketplace/services and charged against node
# credit balances in /marketplace/purchase.
marketplace_services = [
    ServiceOffering(
        service_id="storage_1",
        service_name="SACCP Cloud Storage",
        description="Distributed storage on SACCP network",
        price_per_unit=0.01,  # 0.01 credits per GB/month
        unit_type="gb_month"
    ),
    ServiceOffering(
        service_id="compute_1",
        service_name="SACCP Compute Power",
        description="Distributed computing on SACCP network",
        price_per_unit=0.10,  # 0.10 credits per compute hour
        unit_type="compute_hour"
    ),
    ServiceOffering(
        service_id="ai_model_hosting_1",
        service_name="AI Model Hosting",
        description="Host and serve AI models on SACCP network",
        price_per_unit=0.05,  # 0.05 credits per model-hour
        unit_type="model_hour"
    )
]

# Smilyai approved head nodes (for security)
# Node IDs of HEAD nodes that have passed the smilyai approval flow.
approved_head_nodes = set()
262
+
@app.post("/saccp/register-worker", response_model=NodeRegistrationResponse)
async def register_worker(registration_request: NodeRegistrationRequest):
    """
    Register a worker node with the SACCP network.

    HEAD nodes must be approved by the smilyai approval system before they are
    admitted; other node types register immediately.  Every admitted node is
    registered with BOTH the fault-tolerance manager and the load balancer.

    Fix: the "approved while the request was pending" path previously returned
    before registering the node with the load balancer, unlike every other
    success path; the shared bookkeeping is now in one helper.
    """
    node_id = registration_request.node_id
    node_type = registration_request.capabilities.node_type

    def _admit(nid: str) -> None:
        # Shared bookkeeping for every accepted node: fault tolerance + load balancer.
        fault_tolerance_manager.register_node(
            nid,
            registration_request.capabilities.node_type,
            registration_request.capabilities.dict()
        )
        load_balancer.register_node(
            nid,
            registration_request.capabilities.node_type,
            registration_request.capabilities.dict()
        )

    if node_type == NodeType.HEAD:
        if not smilyai_approval_system.is_approved(node_id, ApprovalType.HEAD_NODE):
            # Queue an approval request for this HEAD node.
            request_id = smilyai_approval_system.request_approval(
                node_id=node_id,
                endpoint=registration_request.endpoint,
                request_type=ApprovalType.HEAD_NODE,
                request_data=registration_request.dict(),
                reason="HEAD node registration",
                requested_by="system"
            )

            # In a real system, you might want to allow some limited access while pending.
            pending_requests = smilyai_approval_system.get_pending_requests()
            if any(req.request_id == request_id for req in pending_requests):
                return NodeRegistrationResponse(
                    success=False,
                    node_id=node_id,
                    message="HEAD node registration requires approval, submitted for review",
                    approval_status="pending"
                )

            # Not pending any more: it was either approved or rejected meanwhile.
            if smilyai_approval_system.is_approved(node_id, ApprovalType.HEAD_NODE):
                approved_head_nodes.add(node_id)
                _admit(node_id)  # fix: now also registers with the load balancer
                return NodeRegistrationResponse(
                    success=True,
                    node_id=node_id,
                    message=f"Successfully registered {node_type} node",
                    approval_status="approved"
                )
            return NodeRegistrationResponse(
                success=False,
                node_id=node_id,
                message="HEAD node registration denied",
                approval_status="rejected"
            )

        # Already-approved HEAD node.
        approved_head_nodes.add(node_id)
        _admit(node_id)
    else:
        # Non-HEAD nodes need no approval.
        _admit(node_id)

    # In a real system, the worker info would be persisted in a database.
    return NodeRegistrationResponse(
        success=True,
        node_id=node_id,
        message=f"Successfully registered {node_type} node",
        approval_status="approved"  # In a real system, this might be "pending" initially
    )
359
+
360
+
@app.post("/saccp/heartbeat")
async def heartbeat(worker_id: str):
    """Record a worker heartbeat; 'alive' if either subsystem recognizes the node."""
    ft_ok = fault_tolerance_manager.heartbeat(worker_id)
    lb_ok = load_balancer.heartbeat_node(worker_id)

    # Either subsystem acknowledging the node counts as alive (same policy
    # as the original's two-branch version, collapsed into one expression).
    status = "alive" if (ft_ok or lb_ok) else "unknown_node"
    return {"status": status, "timestamp": int(time.time())}
377
+
378
+
@app.get("/saccp/next-task")
async def get_next_task(worker_id: str):
    """
    Get the next task for a worker
    """
    # Placeholder implementation: `worker_id` is currently unused.
    # In a real system, check the task queue for available tasks for this worker
    # based on the worker's capabilities
    # For now, return empty dict meaning no tasks available
    # In the real implementation, this would be handled by the load balancer
    return {}  # Empty dict means no tasks available
389
+
390
+
@app.post("/saccp/task-result")
async def report_task_result(worker_id: str, task_id: str, result: Dict):
    """Record a finished task and award the worker credits for it."""
    completed = fault_tolerance_manager.record_task_completion(task_id, worker_id)

    task_type = result.get('task_type', 'compute')

    # Credit schedule per task type; unrecognized types earn the default 0.5.
    rate_table = {
        'inference': 0.1,  # small amount for inference tasks
        'training': 1.0,   # larger amount for training tasks
    }
    credits_awarded = rate_table.get(task_type, 0.5)

    # Pay the worker, tagging the transaction with the task details.
    credits_system.add_credits(worker_id, credits_awarded, CreditReason.TASK_COMPLETION,
                               metadata={"task_id": task_id, "task_type": task_type})

    return {
        "status": "received",
        "credits_awarded": credits_awarded,
        "task_completed": completed,
        "new_balance": credits_system.get_balance(worker_id).balance
    }
421
+
422
+
@app.post("/saccp/task-error")
async def report_task_error(worker_id: str, task_id: str, error: str):
    """Record a task failure and report the recovery strategy chosen for it."""
    strategy = fault_tolerance_manager.record_task_failure(
        task_id, worker_id, FailureType.TASK_TIMEOUT, error
    )

    strategy_name = strategy.value if strategy else "none"
    return {
        "status": "error_received",
        "recovery_strategy": strategy_name,
    }
437
+
438
+
@app.get("/saccp/stats")
async def get_network_stats():
    """
    Get network statistics
    """
    # Get statistics from fault tolerance system; the response shape is
    # whatever fault_tolerance_manager.get_network_health() returns.
    health_stats = fault_tolerance_manager.get_network_health()

    return health_stats
448
+
449
+
@app.get("/saccp/health-detailed")
async def get_detailed_health():
    """Network health summary plus the current list of failed nodes."""
    return {
        "network_health": fault_tolerance_manager.get_network_health(),
        "failed_nodes": fault_tolerance_manager.get_failed_nodes(),
        "timestamp": int(time.time()),
    }
463
+
464
+
@app.get("/saccp/nodes")
async def get_nodes():
    """List all known nodes with total/online counts from the load balancer."""
    nodes = load_balancer.get_node_status()
    online = sum(1 for node in nodes if node["is_available"])

    return NodeListResponse(
        nodes=nodes,
        total_nodes=len(nodes),
        online_nodes=online,
    )
478
+
479
+
@app.post("/saccp/submit-task")
async def submit_task_for_distribution(task_data: Dict):
    """Wrap raw task data into a Task and hand it to the load balancer."""
    # Generated fallback id: timestamp plus a random suffix.
    task_id = task_data.get("task_id", f"task_{int(time.time())}_{random.randint(1000, 9999)}")

    # Map the textual priority onto the TaskPriority enum (default: NORMAL).
    priority_lookup = {
        "low": TaskPriority.LOW,
        "normal": TaskPriority.NORMAL,
        "high": TaskPriority.HIGH,
        "critical": TaskPriority.CRITICAL,
    }
    priority = priority_lookup.get(task_data.get("priority", "normal"), TaskPriority.NORMAL)

    task = Task(
        task_id=task_id,
        task_type=task_data.get("task_type", "compute"),
        priority=priority,
        resource_requirements=task_data.get("resource_requirements", {}),
        estimated_duration=task_data.get("estimated_duration", 30.0),  # seconds
        created_at=time.time(),
    )

    assigned_node = load_balancer.submit_task(task)

    return {
        "task_id": task_id,
        "status": "submitted",
        "assigned_node": assigned_node,
        "timestamp": int(time.time()),
    }
520
+
521
+
@app.get("/saccp/load-balancer-status")
async def get_load_balancer_status():
    """Snapshot of the node pool and the pending task queue."""
    return {
        "node_status": load_balancer.get_node_status(),
        "task_queue": load_balancer.get_task_queue_status(),
        "timestamp": int(time.time()),
    }
535
+
536
+
@app.get("/credits/balance/{node_id}")
async def get_credit_balance(node_id: str):
    """
    Get credit balance for a node
    """
    # The balance object comes straight from the credits subsystem; elsewhere
    # in this file its numeric amount is read via `.balance`.
    balance = credits_system.get_balance(node_id)
    return balance
544
+
545
+
@app.get("/credits/earn/{node_id}/{amount}")
async def earn_credits(node_id: str, amount: float, reason: str = "task_completion"):
    """
    Endpoint for nodes to earn credits by contributing resources.

    Fix: the original validated `reason` against CreditReason member *names*
    (`__members__`) but then constructed the enum by *value*
    (`CreditReason(reason)`) — a valid value failed the name check and a bare
    name crashed construction.  Now we try value lookup, then name lookup,
    then fall back to RESOURCE_CONTRIBUTION.
    """
    try:
        try:
            credit_reason = CreditReason(reason)
        except ValueError:
            credit_reason = CreditReason.__members__.get(
                reason, CreditReason.RESOURCE_CONTRIBUTION
            )
        success = credits_system.add_credits(node_id, amount, credit_reason)

        if success:
            balance = credits_system.get_balance(node_id)
            return {"status": "success", "new_balance": balance.balance}
        return {"status": "failed", "message": "Failed to add credits"}
    except Exception as e:
        # Best-effort endpoint: surface the error as data, not a 500.
        return {"status": "error", "message": str(e)}
562
+
563
+
@app.get("/marketplace/services")
async def get_marketplace_services():
    """
    Get list of available services in the marketplace
    """
    # Returns the static module-level ServiceOffering catalog.
    return marketplace_services
570
+
571
+
@app.post("/marketplace/purchase")
async def purchase_service(service_request: ServiceRequest):
    """Spend a node's credits on a marketplace service.

    Raises 404 for an unknown service, 400 when unavailable or when the
    node's balance cannot cover the cost.
    """
    # Locate the requested offering in the static catalog.
    service = next(
        (s for s in marketplace_services if s.service_id == service_request.service_id),
        None,
    )
    if service is None:
        raise HTTPException(status_code=404, detail="Service not found")
    if not service.availability:
        raise HTTPException(status_code=400, detail="Service not available")

    total_cost = service.price_per_unit * service_request.quantity

    # Debit the purchaser; spend_credits returns False on insufficient funds.
    paid = credits_system.spend_credits(
        service_request.node_id,
        total_cost,
        CreditReason.SERVICE_PURCHASE,
        service.service_name,
        metadata=service_request.parameters,
    )
    if not paid:
        raise HTTPException(status_code=400, detail="Insufficient credits")

    balance = credits_system.get_balance(service_request.node_id)
    return {
        "status": "success",
        "service_id": service.service_id,
        "service_name": service.service_name,
        "cost": total_cost,
        "remaining_balance": balance.balance,
    }
615
+
616
+
# Additional endpoints for credit earning based on node type and contributions

@app.post("/credits/earn-resource-contribution")
async def earn_credits_for_resource_contribution(node_id: str, node_type: NodeType, duration_hours: float,
                                                 resource_amount: float = 1.0):
    """Award credits for contributed resources, scaled by node type, time, and amount."""
    # Hourly credit rate per resource unit for each node class.
    base_rates = {
        NodeType.RAM: 0.5,
        NodeType.DISK: 0.3,
        NodeType.COMPUTE: 0.4,
        NodeType.GPU: 1.0,
        NodeType.TPU: 1.5,
        NodeType.NPU: 1.2,
        NodeType.HEAD: 0.8,
    }

    # Unknown node types earn the compute rate by default.
    credits_to_earn = base_rates.get(node_type, 0.4) * duration_hours * resource_amount

    awarded = credits_system.add_credits(
        node_id,
        credits_to_earn,
        CreditReason.RESOURCE_CONTRIBUTION,
        metadata={
            "node_type": node_type,
            "duration_hours": duration_hours,
            "resource_amount": resource_amount,
        },
    )

    if not awarded:
        return {"status": "failed", "message": "Failed to award credits"}

    balance = credits_system.get_balance(node_id)
    return {
        "status": "success",
        "credits_earned": credits_to_earn,
        "new_balance": balance.balance,
    }
660
+
661
+
@app.get("/credits/top-contributors")
async def get_top_contributors(limit: int = 10):
    """Rank nodes by credit balance (top `limit`)."""
    leaders = credits_system.get_top_nodes_by_balance(limit)
    return {
        "top_contributors": leaders,
        # NOTE(review): counts only the returned leaders, not the whole
        # network — a full node list would be used in a real implementation.
        "total_nodes_in_network": len(leaders),
    }
672
+
673
+
@app.get("/saccp/node-stats/{node_id}")
async def get_node_stats(node_id: str):
    """Per-node stats: credit balance plus the ten most recent transactions."""
    return {
        "node_id": node_id,
        "credit_balance": credits_system.get_balance(node_id),
        "recent_transactions": credits_system.get_transaction_history(node_id, limit=10),
        "status": "active",  # placeholder; a real implementation would probe the node
    }
688
+
689
+
if __name__ == "__main__":
    # Port 7860 matches the Dockerfile EXPOSE and the Hugging Face Spaces default.
    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for Head Node
2
+ fastapi==0.104.1
3
+ uvicorn==0.24.0
4
+ requests==2.31.0
5
+ pydantic==2.5.0
6
+ python-multipart==0.0.6
7
+ huggingface_hub==0.20.1
8
+ tokenizers==0.15.0
9
+ transformers==4.35.2
10
+ numpy==1.24.3
11
+ pytz==2023.3.post1
12
+ aiohttp==3.9.0
space-config.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SACCP Node Space Configuration
2
+ runtime:
3
+ cpu: "medium"
4
+ memory: "16x"
5
+ accelerator: "cpu" # Will be configured based on node type
6
+ env:
7
+ NODE_TYPE: "head"
8
+ NANO_WORKER_URL: "https://Bc-AI-worker-nano.hf.space"
9
+ MINI_WORKER_URL: "https://Bc-AI-worker-mini.hf.space"
10
+ FAST_WORKER_URL: "https://Bc-AI-worker-fast.hf.space"
11
+ LARGE_WORKER_URL: "https://Bc-AI-worker-large.hf.space"
12
+ # NOTE(review): app.py also reads SAM2_WORKER_URL, which is not set here — confirm and add the sam2 worker URL
+ UNIVERSAL_WORKER_URL: "https://Bc-AI-worker-universal.hf.space"