visamram02 committed on
Commit
ac0a230
·
verified ·
1 Parent(s): 321dc65

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +33 -34
app.py CHANGED
@@ -4,11 +4,10 @@ import os
4
  import json
5
  from fastapi import FastAPI, Request
6
  from fastapi.responses import JSONResponse, StreamingResponse
7
- import threading
8
 
9
- # Download model if not exists
10
  model_path = "model.gguf"
11
-
12
  print(f"Loading model from {model_path}...")
13
  llm = Llama(
14
  model_path=model_path,
@@ -17,37 +16,12 @@ llm = Llama(
17
  verbose=False
18
  )
19
 
20
- def predict(message, history):
21
- prompt = ""
22
- for user_msg, assistant_msg in history:
23
- prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
24
- prompt += f"User: {message}\nAssistant:"
25
-
26
- output = llm(
27
- prompt,
28
- max_tokens=512,
29
- stop=["User:"],
30
- echo=False,
31
- stream=True
32
- )
33
-
34
- response = ""
35
- for chunk in output:
36
- delta = chunk['choices'][0]['text']
37
- response += delta
38
- yield response
39
-
40
- demo = gr.ChatInterface(
41
- fn=predict,
42
- title="VisamIntelli-Flash",
43
- description="Your private AI brain on Hugging Face.",
44
- )
45
-
46
- # Create FastAPI app
47
  app = FastAPI()
48
 
49
- # Mount Gradio after defining demo
50
- app = gr.mount_gradio_app(app, demo, path="/")
 
51
 
52
  @app.post("/v1/chat/completions")
53
  async def chat_completions(request: Request):
@@ -67,7 +41,7 @@ async def chat_completions(request: Request):
67
  output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024)
68
  text = output['choices'][0]['text']
69
  return JSONResponse({
70
- "choices": [{"message": {"content": text}}]
71
  })
72
  else:
73
  def generate():
@@ -79,6 +53,31 @@ async def chat_completions(request: Request):
79
 
80
  return StreamingResponse(generate(), media_type="text/event-stream")
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  if __name__ == "__main__":
83
- import uvicorn
84
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
4
  import json
5
  from fastapi import FastAPI, Request
6
  from fastapi.responses import JSONResponse, StreamingResponse
7
+ import uvicorn
8
 
9
+ # 1. Load Model
10
  model_path = "model.gguf"
 
11
  print(f"Loading model from {model_path}...")
12
  llm = Llama(
13
  model_path=model_path,
 
16
  verbose=False
17
  )
18
 
19
+ # 2. FastAPI Setup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  app = FastAPI()
21
 
22
+ @app.get("/health")
23
+ def health():
24
+ return {"status": "ok"}
25
 
26
  @app.post("/v1/chat/completions")
27
  async def chat_completions(request: Request):
 
41
  output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024)
42
  text = output['choices'][0]['text']
43
  return JSONResponse({
44
+ "choices": [{"message": {"content": text.strip()}}]
45
  })
46
  else:
47
  def generate():
 
53
 
54
  return StreamingResponse(generate(), media_type="text/event-stream")
55
 
56
+ # 3. Gradio UI Setup
57
+ def predict(message, history):
58
+ prompt = ""
59
+ for user_msg, assistant_msg in history:
60
+ prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
61
+ prompt += f"User: {message}\nAssistant:"
62
+
63
+ output = llm(prompt, max_tokens=1024, stop=["User:"], echo=False, stream=True)
64
+ response = ""
65
+ for chunk in output:
66
+ delta = chunk['choices'][0]['text']
67
+ response += delta
68
+ yield response
69
+
70
+ demo = gr.ChatInterface(
71
+ fn=predict,
72
+ title="VisamIntelli-Flash",
73
+ description="Your private AI brain on Hugging Face.",
74
+ )
75
+
76
+ # 4. Mount Gradio to FastAPI
77
+ # Mount Gradio at / so the UI is served at the root. Routes are matched in registration order,
78
+ # so the FastAPI routes declared before this mount (/health, /v1/chat/completions) take precedence over it.
79
+ # If those API routes ever stop resolving, mount Gradio at /ui instead.
80
+ app = gr.mount_gradio_app(app, demo, path="/")
81
+
82
  if __name__ == "__main__":
 
83
  uvicorn.run(app, host="0.0.0.0", port=7860)