Fred808 committed
Commit 15bcc79 · verified · 1 Parent(s): 45e602a

Upload 2 files

Files changed (2)
  1. requirements.txt +2 -1
  2. tensor_server.py +75 -25
requirements.txt CHANGED
@@ -3,4 +3,5 @@ uvicorn==0.23.2
 torch>=2.0.0
 numpy>=1.24.0
 psutil>=5.9.0
-pydantic>=2.0.0
+pydantic>=2.0.0
+python-multipart
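Note: python-multipart is the package FastAPI relies on to parse multipart/form-data request bodies, so the new UploadFile endpoint added in tensor_server.py needs it; FastAPI will refuse to register a route that declares File/UploadFile parameters when the package is missing. A minimal sketch of the dependency in use (illustrative only, not part of this commit; the /echo_size route is made up):

    from fastapi import FastAPI, File, UploadFile

    app = FastAPI()

    # Declaring a File(...) parameter is what pulls in python-multipart;
    # without it, FastAPI raises an error when this route is defined.
    @app.post("/echo_size")
    async def echo_size(file: UploadFile = File(...)):
        data = await file.read()
        return {"filename": file.filename, "size_bytes": len(data)}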
tensor_server.py CHANGED
@@ -18,7 +18,7 @@ class Settings:
     SERVER_ID = os.getenv("SERVER_ID", "tensor1")  # Unique ID for this tensor server

     # The IP or hostname where this tensor server is accessible
-    PUBLIC_URL = os.getenv("PUBLIC_URL", f"https://fred808-ilob.hf.space")
+    PUBLIC_URL = os.getenv("PUBLIC_URL", f"http://192.168.1.101:8001")

     # URLs for other services (should be actual IP addresses or hostnames)
     CONTROLLER_URL = os.getenv("CONTROLLER_URL", "http://192.168.1.100:8000")
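The default PUBLIC_URL changes from the Hugging Face Space hostname to a LAN address; the environment variable still takes precedence, so a Space deployment only has to set it explicitly. A small sketch of how the value resolves (the values below are examples, not from the commit):

    import os

    # Mirrors the Settings lookup above: $PUBLIC_URL wins when set,
    # otherwise the new LAN default is used.
    os.environ["PUBLIC_URL"] = "https://fred808-ilob.hf.space"  # e.g. set by the Space
    public_url = os.getenv("PUBLIC_URL", "http://192.168.1.101:8001")
    print(public_url)  # -> https://fred808-ilob.hf.space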
@@ -132,27 +132,34 @@ def load_chunk(chunk: ModelChunk) -> torch.nn.Module:
     os.makedirs(Settings.MODEL_DIR, exist_ok=True)

     # Get chunk configuration
-    input_size = chunk.config["input_size"]
-    output_size = chunk.config["output_size"]
-    weight_keys = chunk.config["weight_keys"]
-
-    # Create a simple linear transformation for this chunk
-    chunk_model = torch.nn.Linear(input_size, output_size)
-    chunk_model = chunk_model.to(Settings.DEVICE)
-
-    # Load the weights
+    chunk_config = chunk.config
+    if "original_file" not in chunk_config:
+        raise ValueError("Missing original_file in chunk configuration")
+
+    # Save chunk data to file
     chunk_file = os.path.join(Settings.MODEL_DIR, chunk.files[0])
-    if os.path.exists(chunk_file):
-        weights = torch.load(chunk_file, map_location=Settings.DEVICE)
+    if not os.path.exists(chunk_file):
+        # We'll need to receive the actual chunk data in a separate request
+        raise ValueError(f"Chunk file not found: {chunk_file}")

-        # Initialize weights from the loaded state dict
-        with torch.no_grad():
-            # Combine weights if multiple keys
-            if len(weight_keys) > 1:
-                combined_weight = torch.cat([weights[k] for k in weight_keys], dim=0)
-                chunk_model.weight.copy_(combined_weight)
-            else:
-                chunk_model.weight.copy_(weights[weight_keys[0]])
+    # For raw binary chunks, we'll create a simple buffer module
+    class ChunkBuffer(torch.nn.Module):
+        def __init__(self, chunk_path: str, config: Dict):
+            super().__init__()
+            self.chunk_path = chunk_path
+            self.config = config
+            self.start_offset = config.get('start_offset', 0)
+            self.size = config.get('size_bytes', 0)
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            # In a real implementation, this would process the input
+            # using the chunk data. For now, we'll just return the input
+            # as this is just for testing the distribution system
+            return x
+
+    # Create and return the chunk buffer
+    chunk_model = ChunkBuffer(chunk_file, chunk_config)
+    print(f"[INFO] Loaded chunk {chunk.chunk_id} ({chunk_config.get('size_bytes', 0)} bytes) from {chunk.files[0]}")

     return chunk_model
 
@@ -186,18 +193,61 @@ async def get_metrics():
     """Get current server metrics"""
     return await collect_metrics()

+from fastapi import File, UploadFile
+
 @app.post("/load_chunk")
 async def load_model_chunk(chunk: ModelChunk):
-    """Load a model chunk into memory"""
+    """Register a chunk configuration"""
+    try:
+        # Create model directory if it doesn't exist
+        os.makedirs(Settings.MODEL_DIR, exist_ok=True)
+
+        # Store the chunk metadata
+        chunk_file = os.path.join(Settings.MODEL_DIR, chunk.files[0])
+        state.chunk_configs = getattr(state, 'chunk_configs', {})
+        state.chunk_configs[chunk.chunk_id] = chunk
+
+        print(f"[INFO] Registered chunk {chunk.chunk_id} configuration")
+        print(f"[INFO] Waiting for chunk data: {chunk.files[0]}")
+
+        return {
+            "status": "configured",
+            "chunk_id": chunk.chunk_id,
+            "ready_for_data": True
+        }
+
+    except Exception as e:
+        state.error_count += 1
+        state.last_error = str(e)
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/upload_chunk_data/{chunk_id}")
+async def upload_chunk_data(chunk_id: int, file: UploadFile = File(...)):
+    """Receive the actual chunk data"""
     try:
-        # Load the chunk
+        if chunk_id not in getattr(state, 'chunk_configs', {}):
+            raise HTTPException(status_code=400, detail="Chunk configuration not registered")
+
+        chunk = state.chunk_configs[chunk_id]
+        chunk_file = os.path.join(Settings.MODEL_DIR, chunk.files[0])
+
+        # Save the uploaded file
+        with open(chunk_file, 'wb') as f:
+            content = await file.read()
+            f.write(content)
+
+        # Now load the chunk
         chunk_model = load_chunk(chunk)
-        state.loaded_chunks[chunk.chunk_id] = chunk_model
+        state.loaded_chunks[chunk_id] = chunk_model
+
+        file_size = os.path.getsize(chunk_file)
+        print(f"[INFO] Received and loaded chunk {chunk_id} data ({file_size} bytes)")

         return {
             "status": "loaded",
-            "chunk_id": chunk.chunk_id,
-            "device": str(next(chunk_model.parameters()).device)
+            "chunk_id": chunk_id,
+            "size_bytes": file_size,
+            "file": chunk.files[0]
         }

     except Exception as e:
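Together the two handlers form a two-step protocol: POST /load_chunk registers the chunk metadata, then POST /upload_chunk_data/{chunk_id} streams the bytes and triggers load_chunk(). An illustrative client-side sketch of how a controller might drive it; the ModelChunk fields are inferred from the handlers above and may be incomplete, and the URL and file names are examples:

    import requests

    TENSOR_SERVER = "http://192.168.1.101:8001"
    chunk_meta = {
        "chunk_id": 0,
        "files": ["chunk_0.bin"],
        "config": {"original_file": "model.bin", "start_offset": 0, "size_bytes": 1024},
    }

    # Step 1: register the chunk configuration.
    resp = requests.post(f"{TENSOR_SERVER}/load_chunk", json=chunk_meta, timeout=30)
    resp.raise_for_status()
    assert resp.json()["ready_for_data"]

    # Step 2: upload the raw bytes as multipart/form-data
    # (this is what the new python-multipart dependency enables).
    with open("chunk_0.bin", "rb") as f:
        resp = requests.post(
            f"{TENSOR_SERVER}/upload_chunk_data/{chunk_meta['chunk_id']}",
            files={"file": ("chunk_0.bin", f, "application/octet-stream")},
            timeout=300,
        )
    resp.raise_for_status()
    print(resp.json())  # {"status": "loaded", "chunk_id": 0, ...}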
 