visamram02 committed on
Commit
321dc65
·
verified ·
1 Parent(s): 41c8100

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. Dockerfile +13 -2
  2. app.py +80 -7
Dockerfile CHANGED
@@ -2,12 +2,23 @@ FROM python:3.10-slim
2
 
3
  # Install system dependencies
4
  RUN apt-get update && apt-get install -y \
 
 
 
 
5
  && rm -rf /var/lib/apt/lists/*
6
 
7
- RUN pip install fastapi uvicorn
 
 
 
 
 
 
 
8
 
9
  COPY app.py .
10
 
11
  EXPOSE 7860
12
 
13
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
2
 
3
  # Install system dependencies
4
  RUN apt-get update && apt-get install -y \
5
+ build-essential \
6
+ wget \
7
+ libgomp1 \
8
+ libopenblas0 \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
+ # Install llama-cpp-python with pre-built wheel (Luigi repo)
12
+ RUN pip install https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
13
+
14
+ # Install Gradio and other UI dependencies
15
+ RUN pip install gradio numpy
16
+
17
+ # Download the model (Qwen 2.5 7B Instruct Quantized Q4_K_M)
18
+ RUN wget https://huggingface.co/bartowski/Qwen2.5-7B-Instruct-GGUF/resolve/main/Qwen2.5-7B-Instruct-Q4_K_M.gguf -O model.gguf
19
 
20
  COPY app.py .
21
 
22
  EXPOSE 7860
23
 
24
+ CMD ["python", "app.py"]
app.py CHANGED
@@ -1,11 +1,84 @@
1
- from fastapi import FastAPI
 
 
 
 
 
 
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  app = FastAPI()
4
 
5
- @app.get("/")
6
- def read_root():
7
- return {"status": "SUCCESS", "message": "VisamIntelli-Flash Hello World"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- @app.get("/v1/models")
10
- def read_models():
11
- return {"data": [{"id": "test-model"}]}
 
1
+ import gradio as gr
2
+ from llama_cpp import Llama
3
+ import os
4
+ import json
5
+ from fastapi import FastAPI, Request
6
+ from fastapi.responses import JSONResponse, StreamingResponse
7
+ import threading
8
 
9
+ # Path to the model file, which is downloaded at image build time (see Dockerfile)
10
+ model_path = "model.gguf"
11
+
12
+ print(f"Loading model from {model_path}...")
13
+ llm = Llama(
14
+ model_path=model_path,
15
+ n_ctx=4096,
16
+ n_threads=4,
17
+ verbose=False
18
+ )
19
+
20
+ def predict(message, history):
21
+ prompt = ""
22
+ for user_msg, assistant_msg in history:
23
+ prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
24
+ prompt += f"User: {message}\nAssistant:"
25
+
26
+ output = llm(
27
+ prompt,
28
+ max_tokens=512,
29
+ stop=["User:"],
30
+ echo=False,
31
+ stream=True
32
+ )
33
+
34
+ response = ""
35
+ for chunk in output:
36
+ delta = chunk['choices'][0]['text']
37
+ response += delta
38
+ yield response
39
+
40
+ demo = gr.ChatInterface(
41
+ fn=predict,
42
+ title="VisamIntelli-Flash",
43
+ description="Your private AI brain on Hugging Face.",
44
+ )
45
+
46
+ # Create FastAPI app
47
  app = FastAPI()
48
 
49
+ # Mount Gradio after defining demo
50
+ app = gr.mount_gradio_app(app, demo, path="/")
51
+
52
+ @app.post("/v1/chat/completions")
53
+ async def chat_completions(request: Request):
54
+ data = await request.json()
55
+ messages = data.get("messages", [])
56
+ stream = data.get("stream", False)
57
+
58
+ # Simple prompt builder
59
+ prompt = ""
60
+ for m in messages:
61
+ role = m.get("role", "user")
62
+ content = m.get("content", "")
63
+ prompt += f"{role.capitalize()}: {content}\n"
64
+ prompt += "Assistant:"
65
+
66
+ if not stream:
67
+ output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024)
68
+ text = output['choices'][0]['text']
69
+ return JSONResponse({
70
+ "choices": [{"message": {"content": text}}]
71
+ })
72
+ else:
73
+ def generate():
74
+ output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024, stream=True)
75
+ for chunk in output:
76
+ text = chunk['choices'][0]['text']
77
+ yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
78
+ yield "data: [DONE]\n\n"
79
+
80
+ return StreamingResponse(generate(), media_type="text/event-stream")
81
 
82
+ if __name__ == "__main__":
83
+ import uvicorn
84
+ uvicorn.run(app, host="0.0.0.0", port=7860)