Spaces:

FrederickSundeep
/

AlphaMate

Runtime error

File size: 15,274 Bytes

"""
Fixed Flask app using microsoft/Phi-4-multimodal-instruct (non-interactive HF loading)
Additional fixes:
- Explicitly request SDPA attention implementation to avoid FlashAttention2 ImportError
- Clear error messages with actionable next steps if flash_attn or peft are missing
- Reads the Hugging Face token from the environment (CHAT_MATE first, then HF_TOKEN)
- Passes trust_remote_code=True and use_auth_token=HF_TOKEN to from_pretrained() to avoid interactive prompt
"""

# ✅ Safe GPU decorator
try:
    from spaces import GPU
except Exception:
    def GPU(func=None, *_args, **_kwargs):
        """No-op stand-in used when the ``spaces`` package cannot be imported.

        Supports both decorator spellings so decorated code keeps working
        without ``spaces``:

        - bare: ``@GPU`` -> the function is returned unchanged;
        - parameterized: ``@GPU(duration=60)`` -> returns a decorator that
          returns the function unchanged (all options are ignored).
        """
        if callable(func):
            # Bare usage: hand the function back untouched.
            return func
        # Parameterized usage: the call received options, not a function.
        return lambda wrapped: wrapped

import os
import sys
import time
import torch
import base64
import re
from io import BytesIO
from PIL import Image
from flask import Flask, request, render_template, jsonify, Response
from flasgger import Swagger, swag_from
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoProcessor,  # may or may not be present for the model; we handle exceptions
)

# -------------------------
# Hugging Face auth (non-interactive)
# -------------------------
# CHAT_MATE is consulted first; HF_TOKEN is the fallback when it is unset or empty.
HF_TOKEN = os.environ.get("CHAT_MATE") or os.environ.get("HF_TOKEN")

# Report once at start-up whether downloads will be authenticated.
print(
    "Hugging Face token detected in env (will use it for model downloads)."
    if HF_TOKEN
    else "No HF token found in env (CHAT_MATE/HF_TOKEN). Proceeding anonymously; private models will fail."
)

# Flask + Swagger setup
app = Flask(__name__, static_folder="static", template_folder="templates")

# Top-level Swagger 2.0 document metadata.
_swagger_template = {
    "swagger": "2.0",
    "info": {
        "title": "ChatMate Real-Time API (Multimodal)",
        "description": "LangChain + DuckDuckGo + Phi-4-Multimodal-Instruct + Stable Diffusion",
        "version": "1.0",
    },
}

# Flasgger configuration: one spec at /apispec.json covering every rule,
# with the interactive UI enabled under /apidocs/.
_swagger_config = dict(
    headers=[],
    specs=[dict(endpoint="apispec", route="/apispec.json", rule_filter=lambda rule: True)],
    static_url_path="/flasgger_static",
    swagger_ui=True,
    specs_route="/apidocs/",
)

swagger = Swagger(app, template=_swagger_template, config=_swagger_config)

# Model load
model_id = "microsoft/Phi-4-multimodal-instruct"
print(f"Loading model: {model_id} (this may take a while)")

# Half precision when a CUDA device is available, full precision otherwise.
if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

# Text tokenizer: remote code is trusted and the env token (if any) is forwarded
# so the download never falls back to an interactive prompt.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_auth_token=HF_TOKEN or None,
)

# Optional multimodal (vision + text) processor. When loading fails for any
# reason, `processor` is left as None and the app continues with text only.
try:
    processor = AutoProcessor.from_pretrained(
        model_id,
        trust_remote_code=True,
        use_auth_token=HF_TOKEN or None,
    )
    print("Loaded AutoProcessor for multimodal inputs.")
except Exception as load_err:
    processor = None
    print(
        "No AutoProcessor available or failed to load. Multimodal inputs may be limited to text. Error:",
        type(load_err).__name__,
        str(load_err),
    )

# Load the model while explicitly requesting the SDPA attention implementation,
# so a missing flash_attn package is not supposed to be a hard requirement.
# Every failure is re-raised after printing guidance, so the launcher still sees it.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        attn_implementation="sdpa",   # request SDPA fallback to avoid requiring flash_attn
        device_map="auto",
        torch_dtype=torch_dtype,
        trust_remote_code=True,
        use_auth_token=HF_TOKEN or None,
    )
    print("Model loaded with attn_implementation='sdpa' (FlashAttention2 not required).")
except ImportError as imp_err:
    # An ImportError mentioning flash attention gets a dedicated how-to-fix message.
    lowered = str(imp_err).lower()
    if any(marker in lowered for marker in ("flash", "flash_attn", "flashattention")):
        print(
            "\nERROR: Model attempted to enable FlashAttention2 but the package 'flash_attn' is not installed or not compatible.\n",
            "Two options to resolve this:",
            "1) Install FlashAttention2 (best for inference speed on supported GPU/CUDA) — see https://github.com/Dao-AILab/flash-attention and HF docs.",
            "   Example (may require matching CUDA/PyTorch):",
            "     pip install flash-attn --no-build-isolation   # or install a prebuilt wheel matching your CUDA/PyTorch",
            "\n2) Use the SDPA fallback by ensuring 'attn_implementation=\"sdpa\"' is passed to from_pretrained (this code attempted that),",
            "   or set model config 'attn_implementation' to 'sdpa' before instantiation. SDPA is slower than flash_attn but works without native CUDA kernels.\n",
            "If you want me to help craft the exact 'pip install' wheel command for your CUDA / PyTorch versions, share `torch.__version__` and `nvidia-smi` output (if available).",
            sep="\n",
        )
    raise
except Exception as e:
    # Any other failure: point at `peft` when the message names it,
    # otherwise just report the exception type and text.
    if "peft" in str(e).lower():
        print("\nERROR: The model requires the 'peft' package but it is not installed.")
        print("Install it with:\n    pip install peft\n")
    else:
        print("Unhandled exception while loading the model:", type(e).__name__, str(e))
    raise

# Keyword detection for triggering web search (you can plug a real search tool later)
REAL_TIME_KEYWORDS = {"latest", "current", "news", "today", "price", "time", "live", "trending", "update", "happening"}

# Whole-word matcher built once from REAL_TIME_KEYWORDS. Plain substring tests
# fire on unrelated words ("sometimes" contains "time", "deliver" contains
# "live"), so keywords must sit on word boundaries; an optional trailing "s"
# keeps simple plurals such as "prices" or "updates" matching.
_REAL_TIME_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(kw) for kw in sorted(REAL_TIME_KEYWORDS)) + r")s?\b",
    re.IGNORECASE,
)


def should_search(message):
    """Return True when *message* contains a real-time keyword as a whole word.

    Matching is case-insensitive. Anything that is not a string (for example
    ``None`` from a request without a message) yields False instead of raising.
    """
    if not isinstance(message, str):
        return False
    return _REAL_TIME_PATTERN.search(message) is not None

# Heuristic to detect incomplete sentences
def is_incomplete(text):
    """Return True unless *text*, ignoring surrounding whitespace, ends with a
    terminator: ``.``, ``!``, ``?``, a single or double quote, or the
    ideographic full stop (U+3002). An empty string counts as incomplete.
    """
    trimmed = text.strip()
    ending = re.search(r"[\.\!\?'\"\u3002]\s*$", trimmed)
    return ending is None

# Helper: build chat prompt from history (fallback if tokenizer.apply_chat_template isn't available)
def build_chat_prompt(system_prompt, history, user_message):
    """Build the text prompt for one chat turn.

    Messages are ordered system -> history -> user. The tokenizer's own chat
    template is used when it exposes ``apply_chat_template``; if that is
    missing or raises, a plain "Role: content" transcript is produced instead.

    Args:
        system_prompt: Text of the system message.
        history: Iterable of ``{"role": ..., "content": ...}`` dicts, or
            ``None`` / empty for a fresh conversation.
        user_message: The new user message.

    Returns:
        The prompt string, ending with the generation cue for the assistant.
    """
    # A JSON `null` history arrives as None; treat it as an empty conversation.
    turns = list(history or [])

    if hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": system_prompt},
            *turns,
            {"role": "user", "content": user_message},
        ]
        try:
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except Exception as e:
            # Best effort: fall through to the manual template, but say why
            # instead of hiding the failure.
            print("apply_chat_template failed; using manual prompt format:", type(e).__name__, str(e))

    # Fallback manual chat template
    prompt_parts = [f"System: {system_prompt}\n\n"]
    for m in turns:
        # `or` also covers keys that are present but None/empty.
        role = str(m.get("role") or "user")
        content = m.get("content") or ""
        prompt_parts.append(f"{role.capitalize()}: {content}\n")
    prompt_parts.append(f"User: {user_message}\nAssistant:")
    return "\n".join(prompt_parts)


def _prepare_image_inputs(images):
    """Preprocess *images* once and return tensors moved to the model device.

    Returns an empty dict when there are no images, no processor was loaded,
    or preprocessing fails (the failure is printed; generation then proceeds
    with text only).
    """
    if not images or processor is None:
        return {}
    try:
        # Processor may return pixel_values or other keys depending on implementation.
        # NOTE(review): some processors require the text alongside the images;
        # if this one does, the call below fails and images are ignored — confirm
        # against the model card.
        image_inputs = processor(images=images, return_tensors="pt")
        return {k: v.to(model.device) for k, v in image_inputs.items()}
    except Exception as e:
        print("Image preprocessing failed:", type(e).__name__, str(e))
        return {}


def _generate_text(prompt_text, image_inputs, max_new_tokens, temperature, top_p):
    """Run one sampled generation for *prompt_text* and return only the new text, stripped."""
    tokenized = tokenizer(prompt_text, return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in tokenized.items()}
    # Measure the prompt length before merging image tensors so the slice below
    # always refers to the text prompt.
    prompt_len = model_inputs["input_ids"].shape[1]
    model_inputs.update(image_inputs)

    with torch.no_grad():
        output_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )

    # Decode newly generated portion only.
    return tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True).strip()


@GPU
def generate_full_reply(message, history, images=None, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """
    Generate a reply using the multimodal Phi-4 model.

    - `message`: the new user message.
    - `history`: list of dicts {role: 'user'/'assistant', content: '...'}
    - `images` (optional): list of PIL.Image objects to include as multimodal context.
    - `max_new_tokens`, `temperature`, `top_p`: sampling settings for the first
      generation; continuations reuse `temperature`/`top_p` with a fixed
      256-token budget.

    If the reply does not look finished (see `is_incomplete`), up to three
    continuation passes are appended. Images are preprocessed once and the
    same tensors are reused for every pass.

    Returns plain string reply.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by Frederick Sundeep Mallela. "
        "Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    max_continuations = 3
    continuation_max_new_tokens = 256

    # Build prompt (chat template)
    prompt = build_chat_prompt(system_prompt, history, message)

    # Loop-invariant: the images do not change between passes.
    image_inputs = _prepare_image_inputs(images)

    reply = _generate_text(prompt, image_inputs, max_new_tokens, temperature, top_p)

    # If reply looks truncated, attempt a limited number of continuations
    for _ in range(max_continuations):
        if not is_incomplete(reply):
            break
        # Continue from the existing conversation by appending the reply to the prompt
        continuation = _generate_text(
            prompt + "\n" + reply,
            image_inputs,
            continuation_max_new_tokens,
            temperature,
            top_p,
        )
        # Stop when nothing new came back or the model repeated itself.
        if not continuation or continuation in reply:
            break
        reply += " " + continuation

    return reply.strip()


# Home
@app.route("/")
def home():
    # Render the index.html template as the landing page.
    landing_page = render_template("index.html")
    return landing_page


# POST /chat-stream -> supports optional base64 image in request JSON under `image_base64`
@app.route("/chat-stream", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Stream assistant reply or image',
    'description': 'Send a message, optional base64 image and history, receive a streamed text reply.',
})
def chat_stream():
    # Request JSON: {"message": str, "history": [..] (optional), "image_base64": str (optional)}.
    # Responds with a text/plain stream of the reply, or 400 + JSON error for bad input.
    # (Comments instead of a docstring: flasgger reads view docstrings into the API spec.)

    # silent=True returns None for a missing/invalid JSON body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    message = data.get("message")
    if not isinstance(message, str) or not message.strip():
        return jsonify({"error": "'message' must be a non-empty string"}), 400

    # A JSON null history is treated as an empty conversation.
    history = data.get("history") or []
    if not isinstance(history, list):
        return jsonify({"error": "'history' must be a list"}), 400

    image_b64 = data.get("image_base64")

    pil_images = None
    if image_b64:
        try:
            # Accept both a data URI ("data:image/png;base64,<payload>") and raw base64.
            _, comma, payload = image_b64.partition(",")
            image_bytes = base64.b64decode(payload if comma else image_b64)
            pil = Image.open(BytesIO(image_bytes)).convert("RGB")
            pil_images = [pil]
        except Exception as e:
            # Best effort: a bad image is reported and the request continues text-only.
            print("Failed to decode image base64:", type(e).__name__, str(e))

    def generate():
        # If message indicates real-time search, you can call an external search tool here
        if should_search(message):
            # placeholder for actual search tool
            reply = "(Live info) I detected a real-time query. Attach a web search tool to fetch live data."
            for chunk in reply.splitlines(keepends=True):
                yield chunk
                time.sleep(0.03)
            return

        reply = generate_full_reply(message, history, images=pil_images)
        # Stream line by line with a short pause between chunks.
        for chunk in reply.splitlines(keepends=True):
            yield chunk
            time.sleep(0.03)

        if is_incomplete(reply):
            yield "\n\n*Reply appears incomplete. Say 'continue' to resume.*"

    return Response(generate(), mimetype='text/plain')


# POST /chat-stream-doc (unchanged logic but reusing generate_full_reply)
@app.route("/chat-stream-doc", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['multipart/form-data'],
    'summary': 'Upload requirement doc & generate downloadable project code (ZIP)',
})
def chat_stream_doc():
    # Multipart form: `file` (.txt or .pdf requirement doc) plus optional
    # `frontend`, `backend`, `database` preferences. Responds with a ZIP of the
    # files parsed from the model output, 400 for bad uploads, or 500 when the
    # output contained no files.
    # (Comments instead of a docstring: flasgger reads view docstrings into the API spec.)
    import zipfile
    from werkzeug.utils import secure_filename

    file = request.files.get('file')
    frontend = request.form.get("frontend", "React")
    backend = request.form.get("backend", "Flask")
    database = request.form.get("database", "PostgreSQL")

    if not file:
        return jsonify({"error": "No file uploaded"}), 400

    filename = secure_filename(file.filename or "")
    # Compare extensions case-insensitively so "SPEC.PDF" is accepted too.
    lowered_name = filename.lower()

    if lowered_name.endswith(".txt"):
        content = file.read().decode("utf-8", errors="ignore")
    elif lowered_name.endswith(".pdf"):
        # Imported here so plain-text uploads work even without pdfplumber installed.
        from pdfplumber import open as pdf_open

        with pdf_open(BytesIO(file.read())) as pdf:
            content = "\n".join(page.extract_text() or "" for page in pdf.pages)
    else:
        return jsonify({"error": "Unsupported file format. Use .txt or .pdf"}), 400

    tech_stack = f"Frontend: {frontend}\nBackend: {backend}\nDatabase: {database}"
    prompt = (
        "You are a full-stack project code generator.\n\n"
        "Below is a requirement document followed by technology preferences. Based on this, generate the full project scaffold.\n\n"
        "Requirement Document:\n"
        f"{content}\n\n"
        f"Technology Stack:\n{tech_stack}\n\n"
        "Your task:\n"
        "- Analyze the requirement and tech stack.\n"
        "- Generate backend code: models, routes, and config.\n"
        "- Generate frontend code: components, services.\n"
        "- Define the database schema.\n\n"
        "✅ Format the output as multiple files in this exact structure:\n"
        "### File: <filename>\n"
        "```<language>\n"
        "<code content>\n"
        "```\n\n"
        "Now generate the complete project below 👇"
    )

    reply = generate_full_reply(prompt, [])

    # Each file is announced as "### File: <name>" followed by a fenced code block.
    file_pattern = r"### File:\s*(.+?)\n```(?:\w+)?\n(.*?)```"

    zip_buffer = BytesIO()
    file_count = 0
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
        for match in re.finditer(file_pattern, reply, re.DOTALL):
            raw_name = match.group(1).strip().strip("`")
            # The name comes from model output: keep it relative and inside the
            # archive root by dropping empty, ".", ".." and drive ("C:") parts.
            parts = [
                part
                for part in raw_name.replace("\\", "/").split("/")
                if part not in ("", ".", "..") and not part.endswith(":")
            ]
            if not parts:
                continue
            zipf.writestr("/".join(parts), match.group(2).strip())
            file_count += 1

    if file_count == 0:
        print("⚠️ No files written to ZIP. Check LLM output format.")
        print("LLM Reply:\n", reply)
        return jsonify({"error": "No files found in generated output."}), 500

    # Send the finished archive as bytes rather than iterating the buffer.
    return Response(
        zip_buffer.getvalue(),
        mimetype='application/zip',
        headers={"Content-Disposition": "attachment; filename=generated_project.zip"}
    )


# Warm-up
if __name__ == "__main__":
    print("🔧 Warming up...")
    # One throwaway generation before serving; a failure here is reported
    # but does not stop the server from starting.
    try:
        generate_full_reply("Hello", [])
    except Exception as warmup_err:
        print("Warm-up failed (this can be normal).", type(warmup_err).__name__, str(warmup_err))

    # Listen on all interfaces; PORT from the environment, 7860 by default.
    listen_port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port)