Spaces:

muhammadnoman76
/

cortex

Sleeping

muhammadnoman76 committed on Apr 30, 2025

Commit

433a189

1 Parent(s): f98f8ce

Fix build issues and optimize Dockerfile

Files changed (5) hide show

Dockerfile CHANGED Viewed

@@ -1,8 +1,8 @@
-FROM python:3.10-slim
 WORKDIR /code
-# Copy packages.txt and install system dependencies
 COPY packages.txt /root/packages.txt
 RUN apt-get update && \
     xargs -r -a /root/packages.txt apt-get install -y && \
@@ -12,20 +12,15 @@ RUN apt-get update && \
 COPY requirements.txt .
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
-# Install llama-cpp-python separately to handle potential issues
-RUN pip install --no-cache-dir llama-cpp-python
-# Set Hugging Face cache directory to a writable location
 ENV HF_HOME=/code/.cache/huggingface
 RUN mkdir -p /code/.cache/huggingface && \
-    chmod -R 777 /code/.cache
 # Copy application code
 COPY . .
-# Ensure correct permissions for the working directory
-RUN chmod -R 777 /code
 # Expose port
 EXPOSE 7860

+FROM python:3.12
 WORKDIR /code
+# Install system dependencies
 COPY packages.txt /root/packages.txt
 RUN apt-get update && \
     xargs -r -a /root/packages.txt apt-get install -y && \
 COPY requirements.txt .
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# Pre-download the model
 ENV HF_HOME=/code/.cache/huggingface
 RUN mkdir -p /code/.cache/huggingface && \
+    pip install huggingface_hub && \
+    python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='muhammadnoman76/cortex_q4', filename='unsloth.Q4_K_M.gguf', local_dir='/code', local_dir_use_symlinks=False)"
 # Copy application code
 COPY . .
 # Expose port
 EXPOSE 7860

README.md CHANGED Viewed

@@ -10,10 +10,11 @@ license: afl-3.0
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 # LLM Streaming API
 This Space provides a FastAPI application that streams responses from the Cortex LLM model.
-- Visit `/ui` for a simple interface to test the model
-- Send POST requests to `/generate` with JSON body containing `task_description`, `max_tokens` (optional), and `temperature` (optional)

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 # LLM Streaming API
 This Space provides a FastAPI application that streams responses from the Cortex LLM model.
+- Send GET requests to `/stream?task=<your_task>` to receive a streamed response from the model.
+- Example: `/stream?task=make an agent which send mail by searching top 5 website from google` (URL-encode the spaces, e.g. as `%20`, if your HTTP client does not do so automatically).
+**Note**: The `/ui` endpoint is not implemented in the current version.

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
-from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 import asyncio
 from fastapi.middleware.cors import CORSMiddleware
@@ -15,14 +14,13 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Download the GGUF file
-model_id = "muhammadnoman76/cortex_q4"
-gguf_filename = "unsloth.Q4_K_M.gguf"  # Replace with the correct filename
-model_path = hf_hub_download(
-    repo_id=model_id,
-    filename=gguf_filename,
-    local_dir=".",
-    local_dir_use_symlinks=False
 )
 alpaca_prompt = """
@@ -51,14 +49,6 @@ Important notes:
 ### Response:
 """
-# Load model from local file in the copied folder
-llm = Llama(
-    model_path= r'.//unsloth.Q4_K_M.gguf',
-    n_ctx=2048,
-    n_batch=512,
-    verbose=False
-)
 async def stream_llm_response(task_description: str):
     prompt = alpaca_prompt.format(task_description)
     stream = llm(
@@ -77,4 +67,4 @@ async def stream_response(task: str = "make an agent which send mail by searchin
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 from llama_cpp import Llama
 import asyncio
 from fastapi.middleware.cors import CORSMiddleware
     allow_headers=["*"],
 )
+# Load model from local file
+model_path = "./unsloth.Q4_K_M.gguf"
+llm = Llama(
+    model_path=model_path,
+    n_ctx=2048,
+    n_batch=512,
+    verbose=False
 )
 alpaca_prompt = """
 ### Response:
 """
 async def stream_llm_response(task_description: str):
     prompt = alpaca_prompt.format(task_description)
     stream = llm(
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)

packages.txt CHANGED Viewed

@@ -2,4 +2,4 @@ build-essential
 cmake
 git
 libopenblas-dev
-libomp-dev

 cmake
 git
 libopenblas-dev
+libomp-dev

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
-fastapi>=0.115.12
-uvicorn>=0.34.2
-pydantic>=2.11.4
-llama-cpp-python>=0.3.8
-huggingface_hub>=0.25.0

+fastapi==0.115.12
+uvicorn==0.34.2
+pydantic==2.11.4
+llama-cpp-python==0.3.8
+huggingface_hub==0.30.2