# xTHExBEASTx's picture
# Update app.py
# 784bc56 verified
import os
import subprocess
import time

import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from starlette.background import BackgroundTask
app = FastAPI()

# FIX 1: raise the context window to 16k so the model can 'read' more of a
# project at once and long prompts are not truncated.
# The original set only OLLAMA_NUM_CTX, which is not the environment variable
# current `ollama serve` releases document -- that is OLLAMA_CONTEXT_LENGTH.
# Set both so either server version picks it up (backward compatible).
# NOTE(review): confirm which variable your deployed Ollama version honors.
os.environ["OLLAMA_NUM_CTX"] = "16384"
os.environ.setdefault("OLLAMA_CONTEXT_LENGTH", "16384")

# Start the Ollama server in the background. It inherits the variables above
# because os.environ was mutated before the process was spawned.
subprocess.Popen(["ollama", "serve"])
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_ollama(request: Request, path: str):
    """Reverse-proxy any request (e.g. '/v1/messages') to the local Ollama server.

    The upstream body is streamed back unbuffered so chunked / SSE completion
    responses reach the caller incrementally.
    """
    # Guard against clients that already prefix '/v1' (avoid '/v1/v1/...').
    target_path = path if not path.startswith("v1/v1") else path.replace("v1/v1", "v1")
    url = httpx.URL(
        f"http://127.0.0.1:11434/{target_path}",
        query=request.url.query.encode("utf-8"),
    )

    # BUG FIX: the original used `async with httpx.AsyncClient(...)`, which
    # closed the client as soon as this handler returned -- *before*
    # StreamingResponse had finished iterating the upstream body, truncating
    # streamed responses. Create the client explicitly and release both the
    # response and the client in a background task that runs only after the
    # stream has been fully consumed.
    client = httpx.AsyncClient(timeout=None)
    req = client.build_request(
        request.method, url, headers=request.headers.raw, content=request.stream()
    )
    resp = await client.send(req, stream=True)

    async def _cleanup() -> None:
        # Close the upstream response first, then the client that owns it.
        await resp.aclose()
        await client.aclose()

    return StreamingResponse(
        resp.aiter_raw(),
        status_code=resp.status_code,
        headers=dict(resp.headers),
        background=BackgroundTask(_cleanup),
    )
@app.get("/")
def health_check():
    """Liveness probe: report the served model and configured context size."""
    payload = {
        "status": "running",
        "model": "qwen2.5-coder:3b",
        "context": "16k",
    }
    return payload
if __name__ == "__main__":
    # Wait until the background `ollama serve` process actually answers,
    # instead of the original blind `time.sleep(5)` -- a fixed sleep was
    # flaky on slow cold starts. Give up after 60 s and proceed anyway so a
    # hung Ollama does not prevent the web server from reporting status.
    deadline = time.time() + 60
    while time.time() < deadline:
        try:
            httpx.get("http://127.0.0.1:11434/", timeout=2)
            break
        except httpx.HTTPError:
            time.sleep(1)

    # FIX 2: the 3B model gives a 'snappy' CPU response compared to the
    # 14-minute 7B delay.
    print("Pulling Qwen 2.5 Coder 3B...")
    # check=False: a failed pull (e.g. already present / transient network
    # error) should not abort startup; Ollama will retry on first request.
    subprocess.run(["ollama", "pull", "qwen2.5-coder:3b"], check=False)

    # Start the web server for Hugging Face (Spaces expects port 7860).
    uvicorn.run(app, host="0.0.0.0", port=7860)