"""
Fixed Flask app using microsoft/Phi-4-multimodal-instruct (non-interactive HF loading)
Additional fixes:
- Explicitly request SDPA attention implementation to avoid FlashAttention2 ImportError
- Clear error messages with actionable next steps if flash_attn or peft are missing
- Uses HF_TOKEN from env (CHAT_MATE or HF_TOKEN)
- Passes trust_remote_code=True and use_auth_token=HF_TOKEN to from_pretrained() to avoid interactive prompt
"""
# ✅ Safe GPU decorator: use the Spaces GPU decorator when available, otherwise a no-op fallback.
try:
    from spaces import GPU
except Exception:
    def GPU(func):
        return func
import os
import sys
import time
import torch
import base64
import re
from io import BytesIO
from PIL import Image
from flask import Flask, request, render_template, jsonify, Response
from flasgger import Swagger, swag_from
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoProcessor,  # may or may not be present for the model; we handle exceptions
)
# -------------------------
# Hugging Face auth (non-interactive)
# -------------------------
HF_TOKEN = os.environ.get("CHAT_MATE") or os.environ.get("HF_TOKEN") # support both names
if HF_TOKEN:
    print("Hugging Face token detected in env (will use it for model downloads).")
else:
    print("No HF token found in env (CHAT_MATE/HF_TOKEN). Proceeding anonymously; private models will fail.")
# Flask + Swagger setup
app = Flask(__name__, static_folder="static", template_folder="templates")
swagger = Swagger(app, template={
    "swagger": "2.0",
    "info": {
        "title": "ChatMate Real-Time API (Multimodal)",
        "description": "LangChain + DuckDuckGo + Phi-4-Multimodal-Instruct + Stable Diffusion",
        "version": "1.0"
    }
}, config={
    "headers": [],
    "specs": [{"endpoint": "apispec", "route": "/apispec.json", "rule_filter": lambda rule: True}],
    "static_url_path": "/flasgger_static",
    "swagger_ui": True,
    "specs_route": "/apidocs/"
})
# Model load
model_id = "microsoft/Phi-4-multimodal-instruct"
print(f"Loading model: {model_id} (this may take a while)")
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Tokenizer (text) - allow remote code and use the token if present
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=HF_TOKEN if HF_TOKEN else None,
    trust_remote_code=True
)
# Try to load a multimodal processor (vision + text). If not available, proceed with text-only.
processor = None
try:
    processor = AutoProcessor.from_pretrained(
        model_id,
        use_auth_token=HF_TOKEN if HF_TOKEN else None,
        trust_remote_code=True
    )
    print("Loaded AutoProcessor for multimodal inputs.")
except Exception as e:
    processor = None
    print("No AutoProcessor available or failed to load. Multimodal inputs may be limited to text. Error:", type(e).__name__, str(e))
# Attempt to load model with SDPA attention implementation (avoids FlashAttention2 requirement).
# If the environment does not have flash_attn installed, transformers may otherwise raise an ImportError.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        device_map="auto",
        use_auth_token=HF_TOKEN if HF_TOKEN else None,
        trust_remote_code=True,
        attn_implementation="sdpa",  # request SDPA fallback to avoid requiring flash_attn
    )
    print("Model loaded with attn_implementation='sdpa' (FlashAttention2 not required).")
except ImportError as imp_err:
    # Specific helpful message when flash_attn is missing
    msg = str(imp_err).lower()
    if "flash" in msg or "flash_attn" in msg or "flashattention" in msg:
        print("\nERROR: The model attempted to enable FlashAttention2, but the package 'flash_attn' is not installed or not compatible.\n")
        print("Two options to resolve this:")
        print("1) Install FlashAttention2 (best for inference speed on supported GPU/CUDA) - see https://github.com/Dao-AILab/flash-attention and the HF docs.")
        print("   Example (may require matching CUDA/PyTorch):")
        print("   pip install flash-attn --no-build-isolation  # or install a prebuilt wheel matching your CUDA/PyTorch")
        print("\n2) Use the SDPA fallback by ensuring 'attn_implementation=\"sdpa\"' is passed to from_pretrained (this code attempted that),")
        print("   or set the model config's 'attn_implementation' to 'sdpa' before instantiation. SDPA is slower than flash_attn but works without custom CUDA kernels.\n")
        print("To pick the right flash-attn wheel for your environment, check torch.__version__ and the CUDA version reported by nvidia-smi (if available).")
    # Re-raise so the outer system/launcher can see the failure (or you can choose to sys.exit)
    raise
except Exception as e:
    # Catch other missing-dependency cases (like peft)
    err_str = str(e).lower()
    if "peft" in err_str:
        print("\nERROR: The model requires the 'peft' package but it is not installed.")
        print("Install it with:\n    pip install peft\n")
        raise
    # Generic fallthrough
    print("Unhandled exception while loading the model:", type(e).__name__, str(e))
    raise
# Keyword detection for triggering web search (you can plug a real search tool later)
REAL_TIME_KEYWORDS = {"latest", "current", "news", "today", "price", "time", "live", "trending", "update", "happening"}
def should_search(message):
    return any(kw in message.lower() for kw in REAL_TIME_KEYWORDS)
# Heuristic to detect incomplete sentences
def is_incomplete(text):
    # True when the text does not end in terminal punctuation (., !, ?, a quote, or "。").
    return not re.search(r"[\.\!\?'\"\u3002]\s*$", text.strip())
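# Illustrative behaviour of the heuristic above:
#   is_incomplete("The answer is 42.")  -> False (ends with a period)
#   is_incomplete("The answer is")      -> True  (triggers the continuation loop in generate_full_reply)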
# Helper: build chat prompt from history (fallback if tokenizer.apply_chat_template isn't available)
def build_chat_prompt(system_prompt, history, user_message):
    # messages: system -> history -> user
    if hasattr(tokenizer, "apply_chat_template"):
        messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": user_message}]
        try:
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except Exception:
            pass
    # Fallback manual chat template
    prompt_parts = [f"System: {system_prompt}\n\n"]
    for m in history:
        role = m.get("role", "user")
        content = m.get("content", "")
        prompt_parts.append(f"{role.capitalize()}: {content}\n")
    prompt_parts.append(f"User: {user_message}\nAssistant:")
    return "\n".join(prompt_parts)
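# Example of the manual fallback output (illustrative): for
# history=[{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
# the prompt contains, in order, "System: <system_prompt>", "User: Hi", "Assistant: Hello!",
# "User: <user_message>", and a trailing "Assistant:" line that cues the model's next reply.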
@GPU
def generate_full_reply(message, history, images=None, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """
    Generate a reply using the multimodal Phi-4 model.
    - `images` (optional): list of PIL.Image objects to include as multimodal context.
    - `history`: list of dicts {role: 'user'/'assistant', content: '...'}
    Returns a plain string reply.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by Frederick Sundeep Mallela. "
        "Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    # Build prompt (chat template)
    prompt = build_chat_prompt(system_prompt, history, message)
    # Tokenize text and move tensors to the model's device
    tokenized = tokenizer(prompt, return_tensors="pt")
    tokenized = {k: v.to(model.device) for k, v in tokenized.items()}
    # If images were provided and we have a processor, preprocess them and include them in the inputs
    model_inputs = dict(tokenized)
    if images and processor is not None:
        try:
            # The processor may return pixel_values or other keys depending on the implementation
            image_inputs = processor(images=images, return_tensors="pt")
            # Move image tensors to the model device
            image_inputs = {k: v.to(model.device) for k, v in image_inputs.items()}
            # Merge image features into the text inputs
            model_inputs.update(image_inputs)
        except Exception as e:
            print("Image preprocessing failed:", type(e).__name__, str(e))
    # Generate output (non-streaming). Many multimodal LM checkpoints accept a merged input dict.
    with torch.no_grad():
        output_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
    # Decode only the newly generated portion (everything after the prompt tokens)
    input_len = tokenized["input_ids"].shape[1]
    generated = tokenizer.decode(output_ids[0][input_len:], skip_special_tokens=True)
    # If the reply looks truncated, attempt a limited number of continuations
    reply = generated.strip()
    max_loops = 3
    loop_count = 0
    while is_incomplete(reply) and loop_count < max_loops:
        loop_count += 1
        # Continue from the existing conversation by appending the reply to the prompt
        continued_prompt = prompt + "\n" + reply
        tokenized2 = tokenizer(continued_prompt, return_tensors="pt")
        tokenized2 = {k: v.to(model.device) for k, v in tokenized2.items()}
        # If images exist, include them again (most models expect images only once; test for your model)
        model_inputs2 = dict(tokenized2)
        if images and processor is not None:
            try:
                image_inputs2 = processor(images=images, return_tensors="pt")
                image_inputs2 = {k: v.to(model.device) for k, v in image_inputs2.items()}
                model_inputs2.update(image_inputs2)
            except Exception as e:
                print("Image re-processing failed:", type(e).__name__, str(e))
        with torch.no_grad():
            out2 = model.generate(
                **model_inputs2,
                max_new_tokens=256,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
            )
        input_len2 = tokenized2["input_ids"].shape[1]
        continuation = tokenizer.decode(out2[0][input_len2:], skip_special_tokens=True).strip()
        # Stop if the model produced nothing new or repeated itself
        if not continuation or continuation in reply:
            break
        reply += " " + continuation
    return reply.strip()
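# Illustrative direct call (the image filename is a placeholder):
#   reply = generate_full_reply("Describe this image", history=[], images=[Image.open("photo.jpg")])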
# Home
@app.route("/")
def home():
    return render_template("index.html")
# POST /chat-stream -> supports optional base64 image in request JSON under `image_base64`
@app.route("/chat-stream", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Stream assistant reply or image',
    'description': 'Send a message, optional base64 image and history, receive a streamed text reply.',
})
def chat_stream():
    data = request.get_json(silent=True) or {}
    message = data.get("message", "")
    history = data.get("history", [])
    image_b64 = data.get("image_base64")
    pil_images = None
    if image_b64:
        try:
            # Accept both raw base64 and data-URI strings ("data:image/png;base64,...")
            header, b64 = (image_b64.split(",", 1) + [None])[:2]
            image_bytes = base64.b64decode(b64 or image_b64)
            pil = Image.open(BytesIO(image_bytes)).convert("RGB")
            pil_images = [pil]
        except Exception as e:
            print("Failed to decode image base64:", type(e).__name__, str(e))

    def generate():
        # If the message indicates a real-time query, an external search tool could be called here
        if should_search(message):
            # Placeholder for an actual search tool
            reply = "(Live info) I detected a real-time query. Attach a web search tool to fetch live data."
            for token in reply.splitlines(keepends=True):
                yield token
                time.sleep(0.03)
            return
        reply = generate_full_reply(message, history, images=pil_images)
        for token in reply.splitlines(keepends=True):
            yield token
            time.sleep(0.03)
        if is_incomplete(reply):
            yield "\n\n*Reply appears incomplete. Say 'continue' to resume.*"

    return Response(generate(), mimetype='text/plain')
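# Illustrative client call (sketch; assumes the `requests` package and the app running
# locally on the default port 7860 configured in __main__ below):
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/chat-stream",
#       json={"message": "What can you do?", "history": []},
#       stream=True,
#   )
#   for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#       print(chunk, end="")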
# POST /chat-stream-doc (unchanged logic but reusing generate_full_reply)
@app.route("/chat-stream-doc", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['multipart/form-data'],
    'summary': 'Upload requirement doc & generate downloadable project code (ZIP)',
})
def chat_stream_doc():
    import zipfile
    from werkzeug.utils import secure_filename
    from pdfplumber import open as pdf_open

    file = request.files.get('file')
    frontend = request.form.get("frontend", "React")
    backend = request.form.get("backend", "Flask")
    database = request.form.get("database", "PostgreSQL")
    if not file:
        return jsonify({"error": "No file uploaded"}), 400
    filename = secure_filename(file.filename)
    content = ""
    if filename.endswith(".txt"):
        content = file.read().decode("utf-8", errors="ignore")
    elif filename.endswith(".pdf"):
        file_bytes = file.read()
        with pdf_open(BytesIO(file_bytes)) as pdf:
            content = "\n".join(page.extract_text() or "" for page in pdf.pages)
    else:
        return jsonify({"error": "Unsupported file format. Use .txt or .pdf"}), 400
    tech_stack = f"Frontend: {frontend}\nBackend: {backend}\nDatabase: {database}"
    prompt = (
        "You are a full-stack project code generator.\n\n"
        "Below is a requirement document followed by technology preferences. Based on this, generate the full project scaffold.\n\n"
        "Requirement Document:\n"
        f"{content}\n\n"
        f"Technology Stack:\n{tech_stack}\n\n"
        "Your task:\n"
        "- Analyze the requirement and tech stack.\n"
        "- Generate backend code: models, routes, and config.\n"
        "- Generate frontend code: components, services.\n"
        "- Define the database schema.\n\n"
        "✅ Format the output as multiple files in this exact structure:\n"
        "### File: <filename>\n"
        "```<language>\n"
        "<code content>\n"
        "```\n\n"
        "Now generate the complete project below 👇"
    )
    reply = generate_full_reply(prompt, [])
    # Extract "### File: <name>" headers followed by fenced code blocks from the model output
    file_pattern = r"### File:\s*(.+?)\n```(?:\w+)?\n(.*?)```"
    matches = re.finditer(file_pattern, reply, re.DOTALL)
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
        file_count = 0
        for match in matches:
            fname = match.group(1).strip().strip("`")
            code = match.group(2).strip()
            zipf.writestr(fname, code)
            file_count += 1
    if file_count == 0:
        print("⚠️ No files written to ZIP. Check the LLM output format.")
        print("LLM Reply:\n", reply)
        return jsonify({"error": "No files found in generated output."}), 500
    zip_buffer.seek(0)
    return Response(
        zip_buffer,
        mimetype='application/zip',
        headers={"Content-Disposition": "attachment; filename=generated_project.zip"}
    )
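# Illustrative client call (sketch; file name and stack values are placeholders):
#   import requests
#   with open("requirements.txt", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/chat-stream-doc",
#           files={"file": f},
#           data={"frontend": "React", "backend": "Flask", "database": "PostgreSQL"},
#       )
#   with open("generated_project.zip", "wb") as out:
#       out.write(resp.content)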
# Warm-up
if __name__ == "__main__":
    print("🔧 Warming up...")
    # Warm up with a small call if you want. Avoid long warm-ups on Spaces start.
    try:
        _ = generate_full_reply("Hello", [])
    except Exception as e:
        print("Warm-up failed (this can be normal).", type(e).__name__, str(e))
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))