Spaces:

visamram02
/

VisamIntelli-Flash

Sleeping

App Files Files Community

visamram02 committed on Mar 16

Commit

321dc65

verified ·

1 Parent(s): 41c8100

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

Dockerfile +13 -2
app.py +80 -7

Dockerfile CHANGED Viewed

@@ -2,12 +2,23 @@ FROM python:3.10-slim
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
-RUN pip install fastapi uvicorn
 COPY app.py .
 EXPOSE 7860
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 # Install system dependencies
 RUN apt-get update && apt-get install -y \
+    build-essential \
+    wget \
+    libgomp1 \
+    libopenblas0 \
     && rm -rf /var/lib/apt/lists/*
+# Install llama-cpp-python with pre-built wheel (Luigi repo)
+# The wheel is tagged cp310 / linux_x86_64, so it has to stay in step with the python:3.10 base image.
+RUN pip install https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
+# Install Gradio and other UI dependencies
+RUN pip install gradio numpy
+# Download the model (Qwen 2.5 7B Instruct Quantized Q4_K_M)
+# It is saved as model.gguf, the relative path app.py loads at startup.
+RUN wget https://huggingface.co/bartowski/Qwen2.5-7B-Instruct-GGUF/resolve/main/Qwen2.5-7B-Instruct-Q4_K_M.gguf -O model.gguf
 COPY app.py .
 EXPOSE 7860
+# app.py starts uvicorn itself on port 7860 when run as a script.
+CMD ["python", "app.py"]

app.py CHANGED Viewed

@@ -1,11 +1,84 @@
-from fastapi import FastAPI
 app = FastAPI()
-@app.get("/")
-def read_root():
-    return {"status": "SUCCESS", "message": "VisamIntelli-Flash Hello World"}
-@app.get("/v1/models")
-def read_models():
-    return {"data": [{"id": "test-model"}]}

+import gradio as gr
+from llama_cpp import Llama
+import os
+import json
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+import threading
+# Load the GGUF model from the working directory. Nothing is downloaded here:
+# the Dockerfile fetches the file to model.gguf at image build time.
+model_path = "model.gguf"
+print(f"Loading model from {model_path}...")
+# Created once at import time; the same instance is used by the Gradio
+# callback and by the /v1/chat/completions route.
+llm = Llama(
+    model_path=model_path,
+    n_ctx=4096,
+    n_threads=4,
+    verbose=False
+)
+def predict(message, history):
+    prompt = ""
+    for user_msg, assistant_msg in history:
+        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
+    prompt += f"User: {message}\nAssistant:"
+    output = llm(
+        prompt,
+        max_tokens=512,
+        stop=["User:"],
+        echo=False,
+        stream=True
+    )
+    response = ""
+    for chunk in output:
+        delta = chunk['choices'][0]['text']
+        response += delta
+        yield response
+# Chat UI driven by predict(), which yields the growing reply text chunk by chunk.
+demo = gr.ChatInterface(
+    fn=predict,
+    title="VisamIntelli-Flash",
+    description="Your private AI brain on Hugging Face.",
+)
+# Create FastAPI app
 app = FastAPI()
+# Mount Gradio after defining demo
+app = gr.mount_gradio_app(app, demo, path="/")
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    data = await request.json()
+    messages = data.get("messages", [])
+    stream = data.get("stream", False)
+    # Simple prompt builder
+    prompt = ""
+    for m in messages:
+        role = m.get("role", "user")
+        content = m.get("content", "")
+        prompt += f"{role.capitalize()}: {content}\n"
+    prompt += "Assistant:"
+    if not stream:
+        output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024)
+        text = output['choices'][0]['text']
+        return JSONResponse({
+            "choices": [{"message": {"content": text}}]
+        })
+    else:
+        def generate():
+            output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024, stream=True)
+            for chunk in output:
+                text = chunk['choices'][0]['text']
+                yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
+            yield "data: [DONE]\n\n"
+        return StreamingResponse(generate(), media_type="text/event-stream")
+# Serve the combined FastAPI + Gradio app when run as a script
+# (the Dockerfile CMD runs `python app.py`).
+if __name__ == "__main__":
+    import uvicorn
+    # Port 7860 matches the EXPOSE line in the Dockerfile.
+    uvicorn.run(app, host="0.0.0.0", port=7860)