Update app.py
app.py
CHANGED
@@ -11,19 +11,21 @@ import gradio as gr
 login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))
 
 # Token authentication for requests
-API_TOKEN = os.getenv("HF_API_TOKEN")
+API_TOKEN = os.getenv("HF_API_TOKEN")
 
 # Set up model loading and pipeline
 torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 os.environ['HF_HOME'] = '/tmp/cache'
-
 model_name = "cerebras/btlm-3b-8k-chat"
-
+revision = "main"  # Pin to stable revision
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, revision=revision)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch_dtype,
     device_map="auto",
-    trust_remote_code=True
+    trust_remote_code=True,
+    revision=revision
 )
 
 generator = pipeline(
@@ -36,7 +38,6 @@ generator = pipeline(
     trust_remote_code=True
 )
 
-# Flask app
 app = Flask(__name__)
 
 @app.route("/")
@@ -45,7 +46,6 @@ def home():
 
 @app.route("/v1/chat/completions", methods=["POST"])
 def chat():
-    # Token auth: require Bearer token
     auth_header = request.headers.get("Authorization", "")
     if not auth_header.startswith("Bearer ") or auth_header.split(" ")[1] != API_TOKEN:
         return jsonify({"error": "Unauthorized"}), 401
@@ -56,7 +56,6 @@ def chat():
     temperature = data.get("temperature", 0.7)
     stream = data.get("stream", False)
 
-    # Build the prompt from chat history
     prompt = ""
     for msg in messages:
         role = msg.get("role", "user").capitalize()
@@ -64,7 +63,6 @@ def chat():
         prompt += f"{role}: {content}\n"
     prompt += "Assistant:"
 
-    # If stream = True, stream response like OpenAI
     if stream:
         def generate_stream():
             output = generator(
@@ -97,7 +95,6 @@ def chat():
 
         return Response(generate_stream(), content_type="text/event-stream")
 
-    # Non-streamed response
     output = generator(
         prompt,
         max_new_tokens=max_tokens,
@@ -109,24 +106,21 @@ def chat():
     reply = output[0]["generated_text"].replace(prompt, "").strip()
 
     return jsonify({
-        "choices": [
-            {
-                "
-
-
-
-
-
-            }
-        ]
+        "choices": [{
+            "message": {
+                "role": "assistant",
+                "content": reply
+            },
+            "finish_reason": "stop",
+            "index": 0
+        }]
     })
 
-# Optional Gradio frontend to keep
+# Optional Gradio frontend to keep Space alive
 with gr.Blocks() as demo:
     gr.Markdown("### LLM backend is running and ready for API calls.")
 
 demo.launch()
 
 if __name__ == "__main__":
-    # Listen on port 8080 as required by HF Spaces
     app.run(host="0.0.0.0", port=8080)
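For reference, a minimal client sketch against the updated endpoint. This is an assumption-laden example, not part of the commit: the Space URL below is a placeholder, and the HF_API_TOKEN value must match the secret configured for the Space.

import os
import requests

BASE_URL = "https://your-space.hf.space"  # hypothetical Space URL

resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    # Bearer token check added in this commit
    headers={"Authorization": f"Bearer {os.environ['HF_API_TOKEN']}"},
    json={
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 128,
        "temperature": 0.7,
        "stream": False,
    },
)
resp.raise_for_status()
# Non-streamed replies now follow the OpenAI-style shape introduced above.
print(resp.json()["choices"][0]["message"]["content"])

Passing "stream": True instead should yield a text/event-stream response, which can be read incrementally with requests' stream=True and iter_lines().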