Spaces:

visamram02
/

VisamIntelli-Flash

Sleeping

App Files Files Community

visamram02 committed on Mar 16

Commit

ac0a230

verified ·

1 Parent(s): 321dc65

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +33 -34

app.py CHANGED Viewed

@@ -4,11 +4,10 @@ import os
 import json
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse
-import threading
-# Download model if not exists
 model_path = "model.gguf"
 print(f"Loading model from {model_path}...")
 llm = Llama(
     model_path=model_path,
@@ -17,37 +16,12 @@ llm = Llama(
     verbose=False
 )
-def predict(message, history):
-    prompt = ""
-    for user_msg, assistant_msg in history:
-        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
-    prompt += f"User: {message}\nAssistant:"
-    output = llm(
-        prompt,
-        max_tokens=512,
-        stop=["User:"],
-        echo=False,
-        stream=True
-    )
-    response = ""
-    for chunk in output:
-        delta = chunk['choices'][0]['text']
-        response += delta
-        yield response
-demo = gr.ChatInterface(
-    fn=predict,
-    title="VisamIntelli-Flash",
-    description="Your private AI brain on Hugging Face.",
-)
-# Create FastAPI app
 app = FastAPI()
-# Mount Gradio after defining demo
-app = gr.mount_gradio_app(app, demo, path="/")
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
@@ -67,7 +41,7 @@ async def chat_completions(request: Request):
         output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024)
         text = output['choices'][0]['text']
         return JSONResponse({
-            "choices": [{"message": {"content": text}}]
         })
     else:
         def generate():
@@ -79,6 +53,31 @@ async def chat_completions(request: Request):
         return StreamingResponse(generate(), media_type="text/event-stream")
 if __name__ == "__main__":
-    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)

 import json
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, StreamingResponse
+import uvicorn
+# 1. Load Model
 model_path = "model.gguf"
 print(f"Loading model from {model_path}...")
 llm = Llama(
     model_path=model_path,
     verbose=False
 )
+# 2. FastAPI Setup
 app = FastAPI()
+@app.get("/health")
+def health():
+    return {"status": "ok"}
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
         output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024)
         text = output['choices'][0]['text']
         return JSONResponse({
+            "choices": [{"message": {"content": text.strip()}}]
         })
     else:
         def generate():
         return StreamingResponse(generate(), media_type="text/event-stream")
+# 3. Gradio UI Setup
+def predict(message, history):
+    prompt = ""
+    for user_msg, assistant_msg in history:
+        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
+    prompt += f"User: {message}\nAssistant:"
+    output = llm(prompt, max_tokens=1024, stop=["User:"], echo=False, stream=True)
+    response = ""
+    for chunk in output:
+        delta = chunk['choices'][0]['text']
+        response += delta
+        yield response
+demo = gr.ChatInterface(
+    fn=predict,
+    title="VisamIntelli-Flash",
+    description="Your private AI brain on Hugging Face.",
+)
+# 4. Mount Gradio to FastAPI
+# Gradio is mounted at "/" so the UI is served at the root. The FastAPI routes
+# registered above (/health, /v1/chat/completions) were added first, and routes
+# are matched in registration order, so they should still take precedence.
+# If the API routes stop responding, mount Gradio at /ui instead.
+app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)