import gradio as gr
from llama_cpp import Llama
import os
import json
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse, RedirectResponse
import uvicorn

# 1. Load Model
model_path = "model.gguf"
print(f"Loading model from {model_path}...")
llm = Llama(
    model_path=model_path,
    n_ctx=1024,          # Drastically reduced context size (saves memory/time on CPU)
    n_threads=8,         # Use all available vCPUs for generation
    n_threads_batch=8,   # Speed up prompt processing
    n_batch=256,         # Batch size for prompt evaluation
    verbose=False
)
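
# Optional sanity check (an assumption, not part of the original flow): uncomment to
# confirm the model loads and generates before the API starts serving requests.
# print(llm("Hello", max_tokens=8)["choices"][0]["text"])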
# 2. FastAPI Setup
app = FastAPI()

@app.get("/")
def read_root():
    # Send the bare root URL to the Gradio UI mounted at /ui
    return RedirectResponse(url="/ui")

@app.get("/health")
def health():
    return {"status": "ok"}

# Route path follows the OpenAI-style chat completions convention
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    data = await request.json()
    messages = data.get("messages", [])
    stream = data.get("stream", False)

    # Simple prompt builder: flatten the chat history into "Role: content" lines
    prompt = ""
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        prompt += f"{role.capitalize()}: {content}\n"
    prompt += "Assistant:"

    if not stream:
        output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024)
        text = output['choices'][0]['text']
        return JSONResponse({
            "choices": [{"message": {"content": text.strip()}}]
        })
    else:
        def generate():
            # Stream tokens back as Server-Sent Events in OpenAI delta format
            output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024, stream=True)
            for chunk in output:
                text = chunk['choices'][0]['text']
                yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
            yield "data: [DONE]\n\n"
        return StreamingResponse(generate(), media_type="text/event-stream")
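
# Example request (a sketch; assumes the server is reachable at <your-space-url> and the
# endpoint is mounted at /v1/chat/completions as above). Streaming works the same way with
# "stream": true and an SSE-capable client:
#
#   curl -X POST <your-space-url>/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}], "stream": false}'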
# 3. Gradio UI Setup
def predict(message, history):
    # Rebuild the conversation as a plain-text prompt from Gradio's (user, assistant) pairs
    prompt = ""
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    output = llm(prompt, max_tokens=1024, stop=["User:"], echo=False, stream=True)
    response = ""
    for chunk in output:
        delta = chunk['choices'][0]['text']
        response += delta
        yield response

demo = gr.ChatInterface(
    fn=predict,
    title="VisamIntelli-Flash",
    description="Your private AI brain on Hugging Face.",
)

# 4. Mount Gradio to FastAPI at /ui
app = gr.mount_gradio_app(app, demo, path="/ui")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
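
# To run locally (a sketch; assumes gradio, llama-cpp-python, fastapi, and uvicorn are
# installed and model.gguf sits next to this file):
#   python app.py   # adjust if this file has a different name
# Then open http://localhost:7860/ui for the chat UI, or GET http://localhost:7860/health
# to verify the server is up.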