Spaces:
Runtime error
Runtime error
File size: 15,274 Bytes
fcd08f2 bcc5afa f7ba872 fcd08f2 f7ba872 fcd08f2 bcc5afa fcd08f2 bcc5afa fcd08f2 bcc5afa fcd08f2 bcc5afa fcd08f2 bcc5afa fcd08f2 bcc5afa fcd08f2 f7ba872 fcd08f2 bcc5afa fcd08f2 bcc5afa fcd08f2 bcc5afa fcd08f2 bcc5afa fcd08f2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 | """
Fixed Flask app using microsoft/Phi-4-multimodal-instruct (non-interactive HF loading)
Additional fixes:
- Explicitly request SDPA attention implementation to avoid FlashAttention2 ImportError
- Clear error messages with actionable next steps if flash_attn or peft are missing
- Uses HF_TOKEN from env (CHAT_MATE or HF_TOKEN)
- Passes trust_remote_code=True and use_auth_token=HF_TOKEN to from_pretrained() to avoid interactive prompt
"""
# ✅ Safe GPU decorator: use the decorator shipped by the `spaces` package when
# it can be imported, otherwise fall back to a no-op so the app still starts.
try:
    from spaces import GPU
except Exception:  # deliberately broad: any failure importing `spaces` must not abort startup
    def GPU(func=None, **_options):
        """No-op stand-in for ``spaces.GPU``.

        Usable both as a bare decorator (``@GPU``) and as a decorator factory
        (``@GPU(duration=60)``). In either form the decorated function is
        returned unchanged; any keyword options are ignored.
        """
        if func is None:
            # Called as a factory: hand back a decorator that returns its target as-is.
            return lambda wrapped: wrapped
        return func
import os
import sys
import time
import torch
import base64
import re
from io import BytesIO
from PIL import Image
from flask import Flask, request, render_template, jsonify, Response
from flasgger import Swagger, swag_from
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
AutoProcessor, # may or may not be present for the model; we handle exceptions
)
# -------------------------
# Hugging Face auth (non-interactive)
# -------------------------
# Both variable names are supported; CHAT_MATE is checked first. Surrounding
# whitespace (for example a trailing newline pasted into a secret) is stripped
# and an empty value counts as "no token", so HF_TOKEN is either a non-empty
# string or None.
HF_TOKEN = None
for _env_name in ("CHAT_MATE", "HF_TOKEN"):
    _candidate = os.environ.get(_env_name, "").strip()
    if _candidate:
        HF_TOKEN = _candidate
        break

if HF_TOKEN:
    print("Hugging Face token detected in env (will use it for model downloads).")
else:
    print("No HF token found in env (CHAT_MATE/HF_TOKEN). Proceeding anonymously; private models will fail.")
# Flask + Swagger setup
app = Flask(__name__, static_folder="static", template_folder="templates")

# Header of the Swagger 2.0 document served by Flasgger.
_swagger_template = {
    "swagger": "2.0",
    "info": {
        "title": "ChatMate Real-Time API (Multimodal)",
        "description": "LangChain + DuckDuckGo + Phi-4-Multimodal-Instruct + Stable Diffusion",
        "version": "1.0",
    },
}

# Flasgger routing: a single spec at /apispec.json that includes every rule,
# with the interactive UI mounted at /apidocs/.
_swagger_config = {
    "headers": [],
    "specs": [
        {
            "endpoint": "apispec",
            "route": "/apispec.json",
            "rule_filter": lambda rule: True,
        },
    ],
    "static_url_path": "/flasgger_static",
    "swagger_ui": True,
    "specs_route": "/apidocs/",
}

swagger = Swagger(app, template=_swagger_template, config=_swagger_config)
# Model load
model_id = "microsoft/Phi-4-multimodal-instruct"
print(f"Loading model: {model_id} (this may take a while)")
# Half precision only when a CUDA device is available; float32 otherwise.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Hub arguments shared by every from_pretrained() call below. `token` is the
# current name of the deprecated `use_auth_token` keyword; None means anonymous.
_hub_kwargs = {"token": HF_TOKEN or None, "trust_remote_code": True}

# Tokenizer (text) — allow remote code and use token if present
tokenizer = AutoTokenizer.from_pretrained(model_id, **_hub_kwargs)

# Try to load a multimodal processor (vision + text). If not available, proceed with text-only.
try:
    processor = AutoProcessor.from_pretrained(model_id, **_hub_kwargs)
except Exception as e:
    processor = None
    print("No AutoProcessor available or failed to load. Multimodal inputs may be limited to text. Error:", type(e).__name__, str(e))
else:
    print("Loaded AutoProcessor for multimodal inputs.")

# Attempt to load model with SDPA attention implementation (avoids FlashAttention2 requirement).
# If the environment does not have flash_attn installed, transformers may otherwise raise an ImportError.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        device_map="auto",
        attn_implementation="sdpa",  # request SDPA fallback to avoid requiring flash_attn
        **_hub_kwargs,
    )
    print("Model loaded with attn_implementation='sdpa' (FlashAttention2 not required).")
except Exception as load_err:
    # One handler for every failure: print the most specific hint available,
    # then ALWAYS re-raise so the launcher sees the failure and `model` is
    # never silently left undefined.
    err_str = str(load_err).lower()
    if isinstance(load_err, ImportError) and "flash" in err_str:
        # Specific helpful message when flash_attn is missing
        print("\nERROR: Model attempted to enable FlashAttention2 but the package 'flash_attn' is not installed or not compatible.\n")
        print("Two options to resolve this:")
        print("1) Install FlashAttention2 (best for inference speed on supported GPU/CUDA) — see https://github.com/Dao-AILab/flash-attention and HF docs.")
        print(" Example (may require matching CUDA/PyTorch):")
        print(" pip install flash-attn --no-build-isolation # or install a prebuilt wheel matching your CUDA/PyTorch")
        print("\n2) Use the SDPA fallback by ensuring 'attn_implementation=\"sdpa\"' is passed to from_pretrained (this code attempted that),")
        print(" or set model config 'attn_implementation' to 'sdpa' before instantiation. SDPA is slower than flash_attn but works without native CUDA kernels.\n")
        print("If you want me to help craft the exact 'pip install' wheel command for your CUDA / PyTorch versions, share `torch.__version__` and `nvidia-smi` output (if available).")
    elif "peft" in err_str:
        # Missing-dependency case; checked for ImportError as well as other exception types.
        print("\nERROR: The model requires the 'peft' package but it is not installed.")
        print("Install it with:\n pip install peft\n")
    else:
        # Generic fallthrough
        print("Unhandled exception while loading the model:", type(load_err).__name__, str(load_err))
    raise
# Keyword detection for triggering web search (you can plug a real search tool later)
REAL_TIME_KEYWORDS = {"latest", "current", "news", "today", "price", "time", "live", "trending", "update", "happening"}

# Splits a lower-cased message into alphabetic words.
_KEYWORD_WORD_RE = re.compile(r"[a-z]+")


def should_search(message):
    """Return True when `message` contains one of REAL_TIME_KEYWORDS as a whole word.

    Matching is case-insensitive and word-based, so "sometimes" or "deliver"
    do not trigger on the embedded "time" / "live". Inflected forms such as
    "prices" are not matched either. Non-string input (e.g. None from a
    request without a message) returns False.
    """
    if not isinstance(message, str):
        return False
    words = _KEYWORD_WORD_RE.findall(message.lower())
    return not REAL_TIME_KEYWORDS.isdisjoint(words)
# Heuristic to detect incomplete sentences
# A reply counts as finished when its last non-space character is sentence
# punctuation, a closing quote, the ideographic full stop, or a backtick
# (closing code fence / inline code).
_COMPLETE_ENDING_RE = re.compile(r"[\.\!\?'\"\u3002`]\s*$")


def is_incomplete(text):
    """Return True when `text` does not end with a recognised terminator.

    Empty or None input is reported as incomplete.
    """
    if not text:
        return True
    return _COMPLETE_ENDING_RE.search(text.strip()) is None
# Helper: build chat prompt from history (fallback if tokenizer.apply_chat_template isn't available)
def build_chat_prompt(system_prompt, history, user_message):
    """Build the text prompt for one chat turn.

    Messages are ordered system -> history -> user. The tokenizer's own chat
    template is used when it exists and succeeds; otherwise a plain
    "Role: content" transcript ending in "Assistant:" is produced.

    Args:
        system_prompt: Text of the system message.
        history: List of ``{"role": ..., "content": ...}`` dicts; None is
            treated as empty and entries that are not dicts are ignored.
        user_message: The new user message.

    Returns:
        The prompt string.
    """
    turns = [m for m in (history or []) if isinstance(m, dict)]

    if hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": system_prompt},
            *turns,
            {"role": "user", "content": user_message},
        ]
        try:
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except Exception as e:
            # Best effort: report why the template failed, then use the manual format below.
            print("apply_chat_template failed; using manual prompt format:", type(e).__name__, str(e))

    # Fallback manual chat template
    prompt_parts = [f"System: {system_prompt}\n\n"]
    for m in turns:
        role = str(m.get("role", "user"))
        content = m.get("content", "")
        prompt_parts.append(f"{role.capitalize()}: {content}\n")
    prompt_parts.append(f"User: {user_message}\nAssistant:")
    return "\n".join(prompt_parts)
# Upper bounds for the "reply looks truncated" continuation rounds.
_MAX_CONTINUATIONS = 3
_CONTINUATION_MAX_NEW_TOKENS = 256


def _generate_once(prompt_text, images, max_new_tokens, temperature, top_p, image_error_label):
    """Run a single generate() call on `prompt_text` and return only the newly generated text, stripped."""
    tokenized = tokenizer(prompt_text, return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in tokenized.items()}
    # Prompt length in tokens; everything after it in the output is new text.
    input_len = model_inputs["input_ids"].shape[1]

    # If images were provided and we have a processor, preprocess them and include in inputs
    if images and processor is not None:
        try:
            # Processor may return pixel_values or other keys depending on implementation
            image_inputs = processor(images=images, return_tensors="pt")
            model_inputs.update({k: v.to(model.device) for k, v in image_inputs.items()})
        except Exception as e:
            # Best effort: continue with text-only inputs.
            print(image_error_label, type(e).__name__, str(e))
    # NOTE(review): whether this checkpoint accepts image tensors merged into the
    # text inputs like this is not established here — confirm against the model card.

    # Sampling with a non-positive temperature is rejected by generate(); use
    # greedy decoding in that case instead of failing.
    if temperature is not None and temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature, "top_p": top_p}
    else:
        sampling_kwargs = {"do_sample": False}

    with torch.no_grad():
        output_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            **sampling_kwargs,
        )
    return tokenizer.decode(output_ids[0][input_len:], skip_special_tokens=True).strip()


@GPU
def generate_full_reply(message, history, images=None, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """
    Generate a reply using the multimodal Phi-4 model.

    - `message`: the new user message.
    - `history`: list of dicts {role: 'user'/'assistant', content: '...'}
    - `images` (optional): list of PIL.Image objects to include as multimodal context.
    - `max_new_tokens`: token budget of the first generation pass.
    - `temperature`, `top_p`: sampling settings; a temperature <= 0 selects greedy decoding.

    If the reply does not look finished (see `is_incomplete`), up to
    `_MAX_CONTINUATIONS` extra passes are run on prompt + reply-so-far and
    their output is appended.

    Returns plain string reply.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by Frederick Sundeep Mallela. "
        "Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    # Build prompt (chat template)
    prompt = build_chat_prompt(system_prompt, history, message)

    reply = _generate_once(
        prompt, images, max_new_tokens, temperature, top_p, "Image preprocessing failed:"
    )

    # If reply looks truncated, attempt a limited number of continuations
    for _ in range(_MAX_CONTINUATIONS):
        if not is_incomplete(reply):
            break
        # Continue from the existing conversation by appending the reply to the prompt.
        # Images are passed again (most models expect images only once; test for your model).
        continuation = _generate_once(
            prompt + "\n" + reply,
            images,
            _CONTINUATION_MAX_NEW_TOKENS,
            temperature,
            top_p,
            "Image re-processing failed:",
        )
        # Stop when nothing new came back, to avoid looping on repeated text.
        if not continuation or continuation in reply:
            break
        reply += " " + continuation
    return reply.strip()
# Home
@app.route("/")
def home():
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render_template("index.html")
    return page
# POST /chat-stream -> supports optional base64 image in request JSON under `image_base64`
@app.route("/chat-stream", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Stream assistant reply or image',
    'description': 'Send a message, optional base64 image and history, receive a streamed text reply.',
})
def chat_stream():
    """Stream the assistant's reply to a chat message as plain text.

    Expects a JSON object with:
      - ``message`` (str, required): the user message.
      - ``history`` (list, optional): earlier turns; anything else is treated as empty.
      - ``image_base64`` (str, optional): a base64 image, bare or as a data URL.

    Returns a ``text/plain`` streaming response, or HTTP 400 with a JSON error
    when the body is not a JSON object or ``message`` is missing or blank.
    """
    # silent=True: a missing or malformed body yields None instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    message = data.get("message")
    # Validate up front; a failure inside the generator would surface only
    # after the response had already started streaming.
    if not isinstance(message, str) or not message.strip():
        return jsonify({"error": "Field 'message' is required"}), 400

    history = data.get("history")
    if not isinstance(history, list):
        history = []

    image_b64 = data.get("image_base64")
    pil_images = None
    if image_b64:
        try:
            # Accept a bare base64 payload or a data URL ("<header>,<payload>").
            _header, _sep, payload = image_b64.partition(",")
            image_bytes = base64.b64decode(payload or image_b64)
            pil = Image.open(BytesIO(image_bytes)).convert("RGB")
            pil_images = [pil]
        except Exception as e:
            # Best effort: an undecodable image is reported and the request continues text-only.
            print("Failed to decode image base64:", type(e).__name__, str(e))

    def emit_lines(text):
        """Yield `text` line by line with a short pause between chunks."""
        for line in text.splitlines(keepends=True):
            yield line
            time.sleep(0.03)

    def generate():
        # If message indicates real-time search, you can call an external search tool here
        if should_search(message):
            # placeholder for actual search tool
            yield from emit_lines(
                "(Live info) I detected a real-time query. Attach a web search tool to fetch live data."
            )
            return
        reply = generate_full_reply(message, history, images=pil_images)
        yield from emit_lines(reply)
        if is_incomplete(reply):
            yield "\n\n*Reply appears incomplete. Say 'continue' to resume.*"

    return Response(generate(), mimetype='text/plain')
# Turns a model-supplied file name into a relative, traversal-free ZIP member path.
def _safe_archive_name(raw_name):
    """Return `raw_name` with backslashes normalised and empty, "." and ".." segments removed ("" if nothing is left)."""
    segments = raw_name.replace("\\", "/").split("/")
    return "/".join(seg for seg in segments if seg not in ("", ".", ".."))


# POST /chat-stream-doc (unchanged logic but reusing generate_full_reply)
@app.route("/chat-stream-doc", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['multipart/form-data'],
    'summary': 'Upload requirement doc & generate downloadable project code (ZIP)',
})
def chat_stream_doc():
    """Generate a project scaffold from an uploaded requirement document and return it as a ZIP.

    Form fields:
      - ``file``: the requirement document (.txt or .pdf, extension matched case-insensitively).
      - ``frontend`` / ``backend`` / ``database``: technology preferences
        (defaults: React / Flask / PostgreSQL).

    Returns:
      - 200 with ``application/zip`` holding every ``### File:`` section found in the model reply.
      - 400 JSON error when no file is uploaded or the format is unsupported.
      - 500 JSON error when the model reply contains no file sections.
    """
    import zipfile
    from werkzeug.utils import secure_filename

    file = request.files.get('file')
    frontend = request.form.get("frontend", "React")
    backend = request.form.get("backend", "Flask")
    database = request.form.get("database", "PostgreSQL")
    if not file:
        return jsonify({"error": "No file uploaded"}), 400

    filename = secure_filename(file.filename or "")
    lowered_name = filename.lower()
    if lowered_name.endswith(".txt"):
        content = file.read().decode("utf-8", errors="ignore")
    elif lowered_name.endswith(".pdf"):
        # Imported here so that .txt uploads do not require pdfplumber.
        from pdfplumber import open as pdf_open
        file_bytes = file.read()
        with pdf_open(BytesIO(file_bytes)) as pdf:
            # extract_text() may return None for a page; treat that as empty.
            content = "\n".join(page.extract_text() or "" for page in pdf.pages)
    else:
        return jsonify({"error": "Unsupported file format. Use .txt or .pdf"}), 400

    tech_stack = f"Frontend: {frontend}\nBackend: {backend}\nDatabase: {database}"
    prompt = (
        "You are a full-stack project code generator.\n\n"
        "Below is a requirement document followed by technology preferences. Based on this, generate the full project scaffold.\n\n"
        "Requirement Document:\n"
        f"{content}\n\n"
        f"Technology Stack:\n{tech_stack}\n\n"
        "Your task:\n"
        "- Analyze the requirement and tech stack.\n"
        "- Generate backend code: models, routes, and config.\n"
        "- Generate frontend code: components, services.\n"
        "- Define the database schema.\n\n"
        "✅ Format the output as multiple files in this exact structure:\n"
        "### File: <filename>\n"
        "```<language>\n"
        "<code content>\n"
        "```\n\n"
        "Now generate the complete project below 👇"
    )
    reply = generate_full_reply(prompt, [])

    # One match per "### File: <name>" header followed by a fenced code block.
    file_pattern = r"### File:\s*(.+?)\n```(?:\w+)?\n(.*?)```"
    zip_buffer = BytesIO()
    file_count = 0
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
        for match in re.finditer(file_pattern, reply, re.DOTALL):
            # The name comes from model output: sanitise it so the archive
            # cannot contain absolute or parent-directory member paths.
            fname = _safe_archive_name(match.group(1).strip().strip("`"))
            if not fname:
                continue
            code = match.group(2).strip()
            zipf.writestr(fname, code)
            file_count += 1

    if file_count == 0:
        print("⚠️ No files written to ZIP. Check LLM output format.")
        print("LLM Reply:\n", reply)
        return jsonify({"error": "No files found in generated output."}), 500

    # Send the finished archive as bytes rather than iterating the buffer object.
    return Response(
        zip_buffer.getvalue(),
        mimetype='application/zip',
        headers={"Content-Disposition": "attachment; filename=generated_project.zip"}
    )
# Warm-up
if __name__ == "__main__":
    print("π§ Warming up...")
    # One tiny generation to warm the model up. Keep it short: long warm-ups
    # slow down Spaces start. A failure here is reported but does not stop the server.
    try:
        generate_full_reply("Hello", [])
    except Exception as warmup_error:
        print("Warm-up failed (this can be normal).", type(warmup_error).__name__, str(warmup_error))

    # Listen on all interfaces; the port comes from PORT and defaults to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port)
|