xTHExBEASTx committed on
Commit
b24c137
·
verified ·
1 Parent(s): db75d73

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -9
app.py CHANGED
app = FastAPI()

# 1. Start Ollama in the background at import time so the proxy route below
# has an engine to forward to.
# NOTE(review): the Popen handle is discarded, so the child server cannot be
# monitored or shut down cleanly — confirm that is intentional for this Space.
subprocess.Popen(["ollama", "serve"])
13
 
14
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_ollama(request: Request, path: str):
    """Forward every incoming request verbatim to the local Ollama engine.

    The upstream body is streamed back chunk-by-chunk (important for coding
    speed); the httpx response and client are released only after the stream
    has been fully relayed.
    """
    # Rebuild the target URL against the local engine, preserving the query string.
    url = httpx.URL(f"http://127.0.0.1:11434/{path}", query=request.url.query.encode("utf-8"))

    # FIX: do NOT wrap this in `async with AsyncClient()` — the context manager
    # would close the client as soon as this coroutine returns, i.e. *before*
    # StreamingResponse has consumed the body, truncating every streamed reply.
    # Create the client explicitly and close it after streaming completes.
    client = httpx.AsyncClient(timeout=None)

    # Build the request to forward (method, headers and body pass through untouched).
    req = client.build_request(
        request.method,
        url,
        headers=request.headers.raw,
        content=request.stream(),
    )
    resp = await client.send(req, stream=True)

    async def _relay():
        # Yield upstream bytes, then release the connection and the client —
        # runs after the ASGI server finishes sending the response.
        try:
            async for chunk in resp.aiter_raw():
                yield chunk
        finally:
            await resp.aclose()
            await client.aclose()

    return StreamingResponse(
        _relay(),
        status_code=resp.status_code,
        headers=dict(resp.headers),
    )
34
 
 
 
 
 
35
if __name__ == "__main__":
    # 2. Wait for Ollama to wake up, then pull the model.
    # NOTE(review): a fixed 5 s sleep is a startup race — if "ollama serve"
    # takes longer to bind its port, the pull below fails; polling the port
    # would be more robust.
    time.sleep(5)
    subprocess.run(["ollama", "pull", "qwen2.5-coder:7b"])

    # 3. Start the web server on the port Hugging Face expects (7860)
    print("Bridge is active on port 7860. Forwarding to Ollama on 11434...")
    uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
app = FastAPI()

# FIX 1: Increase Context Window to 16k to stop the "Truncation" error.
# This allows the model to 'read' more of your project at once.
# NOTE(review): assumes `import os` exists at the top of the file (the import
# block is outside this diff) — verify. Also confirm "ollama serve" actually
# honors OLLAMA_NUM_CTX: Ollama's docs list OLLAMA_CONTEXT_LENGTH as the
# server-side context-size variable, and a per-request `num_ctx` option
# overrides it either way, so this line may be a no-op.
os.environ["OLLAMA_NUM_CTX"] = "16384"

# Start Ollama in the background. The env var above must be set before this
# spawn so the child process inherits it.
subprocess.Popen(["ollama", "serve"])
17
 
18
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_to_ollama(request: Request, path: str):
    """Reverse proxy: forward Claude CLI requests to the internal Ollama engine.

    Streams the upstream body back to the caller chunk-by-chunk and releases
    the httpx response/client only once the stream has been fully relayed.
    """
    # Rebuild the target URL against the local engine, preserving the query string.
    url = httpx.URL(f"http://127.0.0.1:11434/{path}", query=request.url.query.encode("utf-8"))

    # FIX: do NOT wrap this in `async with AsyncClient()` — the context manager
    # would close the client as soon as this coroutine returns, i.e. *before*
    # StreamingResponse has consumed the body, truncating every streamed reply.
    # Create the client explicitly and close it after streaming completes.
    client = httpx.AsyncClient(timeout=None)

    # Build the forwarded request: method, raw headers and body pass through.
    req = client.build_request(
        request.method,
        url,
        headers=request.headers.raw,
        content=request.stream(),
    )
    resp = await client.send(req, stream=True)

    async def _relay():
        # Yield upstream bytes, then release the connection and the client —
        # runs after the ASGI server finishes sending the response.
        try:
            async for chunk in resp.aiter_raw():
                yield chunk
        finally:
            await resp.aclose()
            await client.aclose()

    return StreamingResponse(
        _relay(),
        status_code=resp.status_code,
        headers=dict(resp.headers),
    )
36
 
37
@app.get("/")
def health_check():
    """Liveness probe: report that the bridge is up and which model/context it fronts."""
    payload = {
        "status": "running",
        "model": "qwen2.5-coder:3b",
        "context": "16k",
    }
    return payload
41
  if __name__ == "__main__":
42
+ # Wait for Ollama to wake up
43
  time.sleep(5)
 
44
 
45
+ # FIX 2: Switch to the 3B model
46
+ # It provides a 'snappy' response on CPU compared to the 14-minute 7B delay.
47
+ print("Pulling Qwen 2.5 Coder 3B...")
48
+ subprocess.run(["ollama", "pull", "qwen2.5-coder:3b"])
49
+
50
+ # Start the web server for Hugging Face
51
+ uvicorn.run(app, host="0.0.0.0", port=7860)