Spaces:
Sleeping
Sleeping
File size: 6,660 Bytes
6e7cdea c87585c 6e7cdea e483a33 e937e73 7655952 6e7cdea ef0659d ac30b3e 6e7cdea feab4b5 6e7cdea 2638098 6e7cdea c87585c e937e73 c87585c e937e73 e483a33 8774e66 e483a33 decfb00 c87585c 3c6674b decfb00 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | import os
import time
import threading
import torch
import gradio as gr
from flask import Flask, request, Response
from flasgger import Swagger, swag_from
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login
from langchain_community.tools import DuckDuckGoSearchRun
import re
from fastapi import FastAPI
from starlette.middleware.wsgi import WSGIMiddleware
from gradio.routes import mount_gradio_app
# Safe GPU decorator
try:
    from spaces import GPU
except ImportError:
    def GPU(func=None, **_options):
        """No-op stand-in for ``spaces.GPU`` when the ``spaces`` package is absent.

        Works both as a bare decorator (``@GPU``) and as a parameterized one
        (``@GPU(duration=...)``). In either form the decorated function is
        returned unchanged; any keyword options are accepted and ignored.
        """
        if callable(func):
            # Bare ``@GPU`` usage: hand the function straight back.
            return func

        def passthrough(wrapped):
            return wrapped

        # Parameterized usage: return a decorator that changes nothing.
        return passthrough
# Flask setup
# Flask application that carries the REST endpoint(s) and the Swagger UI.
flask_app = Flask(__name__)

# Top-level metadata of the generated Swagger 2.0 document.
_swagger_template = {
    "swagger": "2.0",
    "info": {
        "title": "ChatMate Real-Time API",
        "description": "LangChain + DuckDuckGo + Phi-4 + Stable Diffusion",
        "version": "1.0",
    },
}

# Flasgger configuration: a single spec served at /apispec.json whose
# rule filter accepts every rule, with the Swagger UI enabled.
# NOTE(review): flask_app is mounted under "/api" further down in this file,
# so "specs_route" presumably resolves to /api/api/apidocs/ from the outside;
# confirm that this is the intended URL.
_swagger_config = {
    "headers": [],
    "specs": [
        {
            "endpoint": "apispec",
            "route": "/apispec.json",
            "rule_filter": lambda rule: True,
        },
    ],
    "static_url_path": "/flasgger_static",
    "swagger_ui": True,
    "specs_route": "/api/apidocs/",
}

swagger = Swagger(flask_app, template=_swagger_template, config=_swagger_config)
# Hugging Face login (optional)
# Authenticate only when a token is actually configured. Calling login()
# with token=None makes huggingface_hub fall back to an interactive prompt,
# which cannot be answered in a headless deployment.
hf_token = os.environ.get("CHAT_MATE")
if hf_token:
    login(token=hf_token)
# Load Phi-4
# Checkpoint used for both the tokenizer and the causal language model.
model_id = "microsoft/phi-4"

# Evaluate CUDA availability once; it selects both the weight dtype and the
# pipeline device below.
_cuda_ready = torch.cuda.is_available()
_weights_dtype = torch.float16 if _cuda_ready else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=_weights_dtype)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
device = 0 if _cuda_ready else -1

# Shared text-generation pipeline; individual calls may override
# max_new_tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=512,
)
# Keywords whose presence in a message signals a request for fresh,
# real-time information (consumed by should_search).
REAL_TIME_KEYWORDS = {
    "current",
    "happening",
    "latest",
    "live",
    "news",
    "price",
    "time",
    "today",
    "trending",
    "update",
}

# DuckDuckGo search tool instance, created once at import time.
search_tool = DuckDuckGoSearchRun()
def should_search(message):
    """Return True when *message* contains any real-time keyword.

    Matching is case-insensitive and substring-based: the lowercased message
    is tested with ``in`` against every entry of ``REAL_TIME_KEYWORDS``, so a
    keyword embedded in a longer word also counts as a hit.
    """
    lowered = message.lower()
    for keyword in REAL_TIME_KEYWORDS:
        if keyword in lowered:
            return True
    return False
def is_incomplete(text):
    """Return True when *text* does not end with closing punctuation.

    After surrounding whitespace is removed, the text counts as complete only
    if its last character is one of ``. ! ? ' "`` or the ideographic full
    stop (U+3002). An empty string is therefore reported as incomplete.
    """
    trimmed = text.strip()
    ends_cleanly = re.search(r'[\.\!\?\'\"\u3002]\s*$', trimmed) is not None
    return not ends_cleanly
@GPU
def generate_full_reply(message, history):
    """Generate the assistant's reply to *message*, extending it until it ends cleanly.

    Args:
        message: Text of the latest user message.
        history: Earlier turns as a sequence of ``{"role": ..., "content": ...}``
            dicts, oldest first. ``None`` or an empty sequence means no
            earlier turns.

    Returns:
        The reply text with surrounding whitespace removed. If the first
        generation does not end with closing punctuation (per
        ``is_incomplete``), up to five continuation passes are appended.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by "
        "Frederick Sundeep Mallela. Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    # Copy the history into a fresh list so the caller's object is never
    # touched, and tolerate a missing (None) history.
    messages = (
        [{"role": "system", "content": system_prompt}]
        + list(history or [])
        + [{"role": "user", "content": message}]
    )

    # Apply chat-style prompt formatting.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Sampling settings shared by the initial and the continuation passes.
    sampling = {"do_sample": True, "temperature": 0.7, "top_p": 0.9}

    # Initial generation; the pipeline output starts with the prompt, which
    # is sliced off.
    full_output = pipe(prompt, max_new_tokens=512, **sampling)[0]["generated_text"]
    reply = full_output[len(prompt):].strip()

    # Keep extending the reply until it ends properly (bounded to avoid
    # looping forever).
    max_loops = 5
    for _ in range(max_loops):
        if not is_incomplete(reply):
            break

        continuation_prompt = prompt + reply  # include the reply so far
        next_output = pipe(continuation_prompt, max_new_tokens=256, **sampling)[0]["generated_text"]

        # Keep the continuation's leading whitespace: stripping it would fuse
        # the last word of the reply with the first word of the continuation.
        continuation = next_output[len(continuation_prompt):].rstrip()
        new_text = continuation.strip()

        # Stop if nothing new was generated or the model is repeating itself.
        if not new_text or new_text in reply:
            break

        reply += continuation

    return reply.strip()
# Flask streaming endpoint
@flask_app.route("/chat-stream", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Stream assistant reply or image',
    'description': 'Send a message and history, receive either a streamed text reply or base64-encoded image.',
    'parameters': [{
        'name': 'body',
        'in': 'body',
        'required': True,
        'schema': {
            'type': 'object',
            'properties': {
                'message': {'type': 'string', 'example': 'Draw a futuristic city.'},
                'history': {
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'role': {'type': 'string', 'example': 'user'},
                            'content': {'type': 'string', 'example': 'Show me a dragon.'}
                        }
                    }
                }
            },
            'required': ['message']
        }
    }],
    'responses': {
        200: {
            'description': 'Streamed reply or image base64',
            'content': {'text/plain': {}}
        },
        400: {
            'description': 'Malformed request body'
        }
    }
})
def chat_stream():
    # Streams the assistant's reply to a JSON request as plain text.
    #
    # Expected body: {"message": <str>, "history": [<role/content dicts>]},
    # where "history" is optional. Malformed bodies are answered with HTTP 400
    # before any generation starts, instead of failing with an unhandled
    # exception.
    #
    # Deliberately documented with comments rather than a docstring:
    # presumably flasgger would fold a view docstring into the generated spec
    # (TODO confirm), which would alter the published API description.
    payload = request.get_json(silent=True)  # None when the body is not valid JSON
    if not isinstance(payload, dict):
        return Response("Request body must be a JSON object.", status=400, mimetype='text/plain')

    message = payload.get("message")
    if not isinstance(message, str) or not message.strip():
        return Response(
            "Field 'message' is required and must be a non-empty string.",
            status=400,
            mimetype='text/plain',
        )

    history = payload.get("history")
    if history is None:
        history = []
    elif not isinstance(history, list):
        return Response("Field 'history' must be a list.", status=400, mimetype='text/plain')

    def generate():
        # The full reply is produced first, then emitted line by line with a
        # short pause between chunks.
        reply = generate_full_reply(message, history)
        for line in reply.splitlines(keepends=True):
            yield line
            time.sleep(0.05)
        # Tell the client when the reply still does not end cleanly.
        if is_incomplete(reply):
            yield "\n\n*Reply appears incomplete. Say 'continue' to resume.*"

    return Response(generate(), mimetype='text/plain')
# Gradio interface for Hugging Face Space
def gradio_chat(message, history=None):
    """Handle one chat turn coming from the Gradio UI.

    Args:
        message: Text the user just submitted.
        history: Chatbot value as a sequence of ``(user, assistant)`` pairs;
            each pair may be a tuple or a list. ``None`` means an empty chat.

    Returns:
        A ``(textbox_value, chat_history)`` tuple: an empty string that
        clears the input box, and the pair history extended with the new
        ``(message, reply)`` turn.
    """
    # Work on a copy so the caller's list is not mutated; the pair format is
    # kept for the value returned to the Chatbot.
    chat_pairs = list(history or [])

    # Convert the pairs into the role/content dicts that
    # generate_full_reply expects, skipping empty (None) slots.
    model_history = []
    for user_text, assistant_text in chat_pairs:
        if user_text is not None:
            model_history.append({"role": "user", "content": user_text})
        if assistant_text is not None:
            model_history.append({"role": "assistant", "content": assistant_text})

    reply = generate_full_reply(message, model_history)
    chat_pairs.append((message, reply))
    return "", chat_pairs
# Gradio UI: a chat window, a message box and a button that clears the chat.
with gr.Blocks() as demo:
    # Heading text repaired from mis-decoded characters (emoji and dash).
    gr.Markdown("## 🤖 ChatMate – Phi-4 + Live Search (Hugging Face Space)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Type your message")
    clear = gr.Button("Clear Chat")

    # Submitting the textbox runs gradio_chat; its two return values refresh
    # the textbox and the chat window respectively.
    msg.submit(gradio_chat, [msg, chatbot], [msg, chatbot])

    # The clear button resets the chat window by writing None to it.
    clear.click(lambda: None, None, chatbot, queue=False)
# Run Gradio when in HF Spaces, else Flask for local dev
# if __name__ == "__main__":
# if os.environ.get("SPACE_BUILD", "false").lower() == "true":
# demo.launch(server_name="0.0.0.0", server_port=7860)
# else:
# print("π§ Warming up...")
# _ = generate_full_reply("Hello", [])
# app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
# ---------------- Run both ----------------
# def run_flask():
# app.run(host="0.0.0.0", port=8000)
# # Start Flask in a background thread
# threading.Thread(target=run_flask, daemon=True).start()
# ---------------- Combine Flask + Gradio into one app ----------------
fastapi_app = FastAPI()

# Mount the Flask WSGI app (REST endpoint and Swagger UI) under "/api".
fastapi_app.mount("/api", WSGIMiddleware(flask_app))

# Mount the Gradio UI at the root path; `app` is the combined ASGI
# application that serves both the UI and the "/api" routes.
app = mount_gradio_app(fastapi_app, demo, path="/")

# Gradio runs on port 7860 in HF Spaces.
# NOTE(review): running this file directly launches only the Gradio demo, so
# the Flask routes mounted on `app` are presumably not served in that mode;
# serve `app` with an ASGI server if the API is needed too. Confirm.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)