"""ChatMate: a Phi-4 chat assistant exposed through Gradio and a Flask API.

The module loads the ``microsoft/phi-4`` model once at import time and
exposes it through:

* a Gradio Blocks UI (``demo``),
* a Flask endpoint ``POST /chat-stream`` documented with Swagger, and
* a FastAPI application (``app``) that mounts the Flask app under ``/api``
  and the Gradio UI at ``/``.
"""
import os
import re
import threading
import time

import gradio as gr
import torch
from fastapi import FastAPI
from flasgger import Swagger, swag_from
from flask import Flask, Response, jsonify, request
from gradio.routes import mount_gradio_app
from huggingface_hub import login
from langchain_community.tools import DuckDuckGoSearchRun
from starlette.middleware.wsgi import WSGIMiddleware
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# ✅ Safe GPU decorator: fall back to a no-op when `spaces` is not installed.
try:
    from spaces import GPU
except ImportError:
    def GPU(func):
        """Return *func* unchanged (stand-in for ``spaces.GPU``)."""
        return func


# ✅ Flask setup
flask_app = Flask(__name__)
# NOTE(review): the Flask app is mounted under "/api" further down while
# "specs_route" already starts with "/api" — confirm the Swagger UI ends up
# at the URL you expect.
swagger = Swagger(flask_app, template={
    "swagger": "2.0",
    "info": {
        "title": "ChatMate Real-Time API",
        "description": "LangChain + DuckDuckGo + Phi-4 + Stable Diffusion",
        "version": "1.0"
    }
}, config={
    "headers": [],
    "specs": [{"endpoint": 'apispec', "route": '/apispec.json', "rule_filter": lambda rule: True}],
    "static_url_path": "/flasgger_static",
    "swagger_ui": True,
    "specs_route": "/api/apidocs/"
})

# ✅ Hugging Face login (optional): only attempted when a token is configured,
# so a missing CHAT_MATE variable no longer calls login(token=None).
_hf_token = os.environ.get("CHAT_MATE")
if _hf_token:
    login(token=_hf_token)

# ✅ Load Phi-4
model_id = "microsoft/phi-4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=512
)

REAL_TIME_KEYWORDS = {"latest", "current", "news", "today", "price", "time", "live", "trending", "update", "happening"}
# NOTE: `search_tool` and `should_search` are defined here but nothing in this
# file calls them; replies are produced by the model alone.
search_tool = DuckDuckGoSearchRun()

# A reply counts as finished when it ends with one of these characters.
_REPLY_END_RE = re.compile(r'[\.\!\?\'\"\u3002]\s*$')

_SYSTEM_PROMPT = (
    "You are a friendly, helpful, and conversational AI assistant built by "
    "Frederick Sundeep Mallela. "
    "Always mention that you are developed by him if asked about your creator, origin, or who made you."
)

# Sampling settings shared by the initial generation and every continuation.
_SAMPLING_KWARGS = {"do_sample": True, "temperature": 0.7, "top_p": 0.9}

# Upper bound on continuation rounds, so a reply that never "ends properly"
# cannot loop forever.
_MAX_CONTINUATIONS = 5


def should_search(message):
    """Return True if *message* contains any real-time keyword.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words (e.g. "time" inside "sometimes").
    """
    lowered = message.lower()
    return any(keyword in lowered for keyword in REAL_TIME_KEYWORDS)


def is_incomplete(text):
    """Return True unless the stripped *text* ends with ``. ! ? ' "`` or ``。``.

    An empty or whitespace-only string is reported as incomplete.
    """
    return _REPLY_END_RE.search(text.strip()) is None


def _generate_text(prompt, max_new_tokens):
    """Run the pipeline on *prompt* and return only the newly generated text."""
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        return_full_text=False,  # the pipeline strips the prompt for us
        **_SAMPLING_KWARGS,
    )
    return outputs[0]["generated_text"]


@GPU
def generate_full_reply(message, history):
    """Generate the assistant's reply to *message* given the prior *history*.

    Args:
        message: The new user message.
        history: Earlier turns as a list of ``{"role": ..., "content": ...}``
            dicts; ``None`` is treated as an empty history.

    Returns:
        The reply text, stripped of surrounding whitespace. If the first
        generation does not end with sentence-final punctuation (see
        ``is_incomplete``), up to ``_MAX_CONTINUATIONS`` further generations
        are appended.
    """
    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        *(history or []),
        {"role": "user", "content": message},
    ]

    # Apply chat-style prompt formatting
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Initial generation
    reply = _generate_text(prompt, max_new_tokens=512).strip()

    # Keep extending the reply until it ends properly
    for _ in range(_MAX_CONTINUATIONS):
        if not is_incomplete(reply):
            break
        # Include the reply so far so the model continues where it stopped.
        raw_continuation = _generate_text(prompt + reply, max_new_tokens=256)
        fragment = raw_continuation.strip()
        # Stop if nothing new is generated
        if not fragment or fragment in reply:
            break
        # Keep the continuation's leading whitespace: `reply` was stripped, so
        # dropping it would glue the last word of the reply to the next one.
        reply += raw_continuation.rstrip()

    return reply.strip()


# ✅ Flask streaming endpoint
# NOTE(review): the spec below mentions images, but this handler only streams
# text — confirm whether image support lives elsewhere.
@flask_app.route("/chat-stream", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Stream assistant reply or image',
    'description': 'Send a message and history, receive either a streamed text reply or base64-encoded image.',
    'parameters': [{
        'name': 'body',
        'in': 'body',
        'required': True,
        'schema': {
            'type': 'object',
            'properties': {
                'message': {'type': 'string', 'example': 'Draw a futuristic city.'},
                'history': {
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'role': {'type': 'string', 'example': 'user'},
                            'content': {'type': 'string', 'example': 'Show me a dragon.'}
                        }
                    }
                }
            },
            'required': ['message']
        }
    }],
    'responses': {
        200: {
            'description': 'Streamed reply or image base64',
            'content': {'text/plain': {}}
        },
        400: {
            'description': 'Invalid request body'
        }
    }
})
def chat_stream():
    """Stream the assistant's reply to the posted message as plain text.

    Expects a JSON object with a string ``message`` and an optional
    ``history`` list of role/content dicts. Responds with HTTP 400 when the
    body is not a JSON object, ``message`` is missing or blank, or
    ``history`` is not a list.
    """
    # Validate before streaming starts: once the generator below runs, the
    # 200 status line has already been sent and errors cannot be reported.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    message = data.get("message")
    if not isinstance(message, str) or not message.strip():
        return jsonify({"error": "'message' must be a non-empty string."}), 400

    history = data.get("history") or []
    if not isinstance(history, list):
        return jsonify({"error": "'history' must be a list."}), 400

    def generate():
        reply = generate_full_reply(message, history)
        # The reply is produced in full first, then emitted line by line.
        for line in reply.splitlines(keepends=True):
            yield line
            time.sleep(0.05)
        if is_incomplete(reply):
            yield "\n\n*Reply appears incomplete. Say 'continue' to resume.*"

    return Response(generate(), mimetype='text/plain')


# ✅ Gradio interface for Hugging Face Space
def gradio_chat(message, history=None):
    """Handle a Gradio submit: answer *message* and extend the chat history.

    Args:
        message: Text typed by the user.
        history: Chatbot history as ``(user, assistant)`` pairs (tuples or
            lists) — assumed from how the original code flattened it;
            confirm against the Chatbot component's configured format.

    Returns:
        ``("", updated_history)``: an empty string to clear the textbox and
        the pair history with the new exchange appended.
    """
    turns = list(history) if history else []

    # Convert UI pairs into the role/content messages the model expects.
    messages = []
    for user_text, assistant_text in turns:
        if user_text is not None:
            messages.append({"role": "user", "content": user_text})
        if assistant_text is not None:
            messages.append({"role": "assistant", "content": assistant_text})

    reply = generate_full_reply(message, messages)
    # Return the history in the same pair format it arrived in.
    turns.append((message, reply))
    return "", turns


with gr.Blocks() as demo:
    gr.Markdown("## 🤖 ChatMate — Phi-4 + Live Search (Hugging Face Space)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Type your message")
    clear = gr.Button("Clear Chat")
    msg.submit(gradio_chat, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

# ✅ Run Gradio when in HF Spaces, else Flask for local dev
# if __name__ == "__main__":
#     if os.environ.get("SPACE_BUILD", "false").lower() == "true":
#         demo.launch(server_name="0.0.0.0", server_port=7860)
#     else:
#         print("🔧 Warming up...")
#         _ = generate_full_reply("Hello", [])
#         app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))

# ---------------- Run both ----------------
# def run_flask():
#     app.run(host="0.0.0.0", port=8000)
# # Start Flask in a background thread
# threading.Thread(target=run_flask, daemon=True).start()

# ---------------- Combine Flask + Gradio into one app ----------------
fastapi_app = FastAPI()

# Mount Flask under /api so the chat endpoint and Swagger docs are reachable
# through the combined app.
fastapi_app.mount("/api", WSGIMiddleware(flask_app))

# Mount Gradio at the root path; "/api" was mounted first, above.
app = mount_gradio_app(fastapi_app, demo, path="/")

# Gradio runs on port 7860 in HF Spaces
# NOTE(review): this launches `demo` directly rather than the combined `app`,
# so the Flask routes under /api are presumably only served when `app` is run
# by an ASGI server — confirm which entry point the deployment uses.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)