import os

# Enable the Rust-powered hf_transfer downloader for more reliable shard
# downloads in Spaces. The env var is read when huggingface_hub is first
# imported, so set it before the transformers import below, and only when
# the hf_transfer package is actually installed.
try:
    import hf_transfer  # noqa: F401
    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
except ImportError:
    pass
os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")

import threading
import time
import traceback

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Use the fine-tuned Mixtral model (uploaded to the Hugging Face Hub)
MODEL_NAME = "fermsi/1k_na_llm_mixtral"

# Populated lazily by load_model() on first request or at warm start.
model = None
tokenizer = None
loading_status = {"is_loading": False}
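# Note: loading_status is a plain dict flag, not a lock, so two near-simultaneous
# first requests could still race into load_model(). A threading.Lock would close
# that gap; for a mostly single-user Space the flag is usually good enough.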
def load_model():
    global model, tokenizer, loading_status
    if loading_status["is_loading"]:
        return False
    loading_status["is_loading"] = True
    print("🍺 Loading NA Beer Expert Mixtral Model with 4-bit quantization...")
    try:
        # Configure quantization properly for Mixtral
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,  # Better for Mixtral
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
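        # Rough sizing (back-of-envelope, not measured here): Mixtral 8x7B has
        # ~47B parameters, i.e. ~94 GB of weights in bf16; NF4 with double
        # quantization brings that to roughly 24-26 GB, which is what makes a
        # single-GPU Space feasible.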
        # Optional auth token for private models
        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
        # (Hub download env vars are set at import time at the top of this file;
        # setting them here would be too late for huggingface_hub to see them.)
        model_load_kwargs = dict(
            quantization_config=quantization_config,
            device_map="auto",  # shard/offload across available devices
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            # resume_download is omitted: it is deprecated in recent
            # huggingface_hub releases; interrupted downloads resume by default.
        )
        if hf_token:
            model_load_kwargs["token"] = hf_token
        # Load with retries to handle transient S3/CDN hiccups
        last_err = None
        for attempt in range(1, 4):
            try:
                model = AutoModelForCausalLM.from_pretrained(
                    MODEL_NAME,
                    **model_load_kwargs,
                )
                tokenizer_load_kwargs = dict(trust_remote_code=True)
                if hf_token:
                    tokenizer_load_kwargs["token"] = hf_token
                tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **tokenizer_load_kwargs)
                last_err = None
                break
            except Exception as e:
                last_err = e
                wait_s = 2 ** attempt  # exponential backoff: 2s, 4s, 8s
                print(f"⚠️ Download attempt {attempt} failed: {e}. Retrying in {wait_s}s...")
                time.sleep(wait_s)
        # Re-raise the final error so the outer handler logs it and resets the
        # loading flag rather than leaving it stuck at True.
        if last_err is not None:
            raise last_err
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("✅ Mixtral model loaded successfully with 4-bit quantization!")
        loading_status["is_loading"] = False
        return True
    except Exception as e:
        print(f"❌ Model loading failed: {str(e)}")
        print(f"Full traceback: {traceback.format_exc()}")
        loading_status["is_loading"] = False
        return False
def generate_response(message, history):
    global model, tokenizer, loading_status
    # Check if model is loaded
    if model is None:
        if loading_status["is_loading"]:
            return "🍺 Model is currently loading... Please wait a moment and try again."
        # First request triggers a cold load, which can take 3-5 minutes for a
        # model this size.
        load_success = load_model()
        if not load_success:
            return (
                "❌ Model failed to load. Please try again in a few minutes or contact support. "
                "This might be a temporary Hugging Face issue or the model upload may not have "
                "completed successfully."
            )
        # Model loaded successfully, now generate response
        return (
            f"✅ Model loaded successfully! Now processing your question: '{message}'\n\n"
            + _generate_actual_response(message, history)
        )
    return _generate_actual_response(message, history)
def _generate_actual_response(message, history):
    global model, tokenizer
    try:
        def needs_continuation(text: str) -> bool:
            # Heuristic: treat the answer as complete if it ends with sentence
            # punctuation or already mentions pasteurization (the prescribed
            # final step in the system prompt).
            if not text:
                return False
            tail = text.strip()[-1:]
            ended = tail in ".!?" or "pasteurization" in text.lower()
            return not ended

        # Simple anti-loop guard: if the last assistant message repeats the same
        # first sentence as the current draft, force a concise re-generation with
        # different sampling settings.
        def looks_repetitive(draft: str, history_pairs) -> bool:
            if not draft or not history_pairs:
                return False
            last_pair = history_pairs[-1]
            if len(last_pair) < 2 or not last_pair[1]:
                return False
            prev = last_pair[1].split(".\n")[0][:120]
            cur = draft.split(".\n")[0][:120]
            return prev.strip().lower() == cur.strip().lower()

        def extract_response(output_ids, prompt_token_count: int) -> str:
            # Decode only the newly generated tokens. Slicing by token count is
            # more robust than splitting the decoded string on template markers
            # such as [/INST] or <|im_start|>assistant.
            text = tokenizer.decode(output_ids[0][prompt_token_count:], skip_special_tokens=True)
            return text.replace("<|endoftext|>", "").replace("</s>", "").strip()
        # Create a proper prompt format for Mixtral Instruct
        expert_prompt = """
You are Berkeley Yeast's assistant for non-alcoholic (NA) beer brewing.
Speak as a practical brewing expert advising professional brewers (not as a scientist).
Your scope is NA beer with Berkeley Yeast products only.
Hard rules:
- Discuss NA beer only when the user asks about NA beer or brewing with Berkeley Yeast. If the user only greets (e.g., "hi", "hello"), reply briefly and invite an NA brewing question.
- When recommending yeast, suggest Berkeley Yeast NA strains only (NA Classic, NA Cabana). Do not recommend other brands.
- Answer only what the user asked. If unclear, ask a 1‑line clarifying question before giving guidance.
- Be concise by default: 4–8 bullet points or 5–8 short sentences. Avoid rambling.
- Prefer numbers and ranges over prose. Keep outputs within the token budget.
- If safety is relevant, emphasize pH targets and pasteurization succinctly.
- Do not include generic introductions, disclaimers, or marketing prefaces. Start with the direct answer.
- Do not provide unsolicited suggestions, example prompts, or topic lists. Only answer what was asked or ask a brief clarifying question.
- Be blunt and pragmatic. If an approach is unsafe or counterproductive, say clearly that it is a bad idea and explain why. Never insult the user; critique the idea, not the person.
- No sign‑offs or marketing. Do not add closing lines (e.g., "cheers"), generic key‑guideline sections, or contact info unless explicitly requested. Keep answers scoped strictly to the question.
- NA Cabana and NA Classic must not be harvested, repitched, propagated, or reused. Always pitch a fresh pack; harvesting/propagation increases health risks in NA brewing.
- If yeast freshness is in question: Do not recommend using packs older than 4 days from delivery. State clearly that using older liquid yeast is a bad idea for NA brewing quality and safety. Recommend contacting Berkeley Yeast for support/replacement and using a fresh pack. If unsure, say you are unsure and advise reaching out to Berkeley Yeast.
- Tone: concise, direct, and slightly playful. When the user's message is casual/slangy, you may open with "sup bruh" or "sup duuude!" before delivering the technical guidance. Keep the slang minimal and never at the expense of clarity.
- If the user asks about Berkeley Yeast products beyond NA: briefly acknowledge that Berkeley Yeast also offers normal‑strength yeasts with innovative technologies (e.g., Fresh and Tropics). Do not elaborate or market them; pivot back to NA guidance.
- Draft service: do not recommend serving NA beer from kegs/draft. Recommend serving only from pasteurized cans or bottles before any additional guidance.
Recipe presentation defaults (do not contradict):
- Express malt bills as percentages that sum to 100%; do not output weights/volumes unless the user provides a batch size. If they ask for weights, ask their batch volume first and then scale.
- Always include recommended OG and FG (or RDF) targets and expected ABV ≤0.5% for NA. Default targets: OG 5–7°P (1.020–1.028), RDF 10–20% (FG ≈ 3–5°P / 1.012–1.020 depending on OG).
- Provide IBU ranges instead of fixed values unless the user specifies a target.
- For step-by-step instructions, keep 6–8 numbered steps, never end mid‑sentence, and always include final cold crash, pH check (<4.2), packaging, and tunnel pasteurization as the last step. If close to length limits, compress wording rather than truncating.
- For "how to" or "best process" questions, prefer 3–5 concise bullets or steps answering only the asked task. Do not append extra background or general safety reminders unless asked.
Key Berkeley NA facts (use as needed; do not contradict):
- Strains: Maltose-/maltotriose-negative; typical targets OG 5–7°P (1.020–1.028) and RDF 10–20% for ≤0.5% ABV.
- Safety: Final pH <4.6 (ideally <4.2). Top controls are tunnel pasteurization (preferred) and pH control. Work with a process authority for validated PU targets.
- Process: Short mash at high temp; hot-side acidification can help; ferment fast, crash cold; additions cold; avoid repitching.
- Sensory: Use wheat/dextrin or maltodextrin for body; lower IBUs; late hot-side hops; hop extracts for aroma.
- Packaging: Cans/bottles only; higher carbonation; avoid draft due to contamination risk.
- Differentiator: Berkeley Yeast strains are genetically engineered to solve practical brewer problems (e.g., flavor, stability, cold‑chain, NA process). Keep it factual and concise; avoid marketing language.
- Terminology: RDF stands for Real Degree of Fermentation.
Mash guidance (defaults unless the user specifies otherwise):
- Mash temperature: 160–165°F (71–74°C).
- Mash time: 5–10 minutes to limit fermentability and control RDF.
Acidification guidance:
- Mash pH: Use food‑grade lactic acid or phosphoric acid to adjust mash pH (target ≈5.1; 4.8–5.0 also acceptable for safety).
- Post‑fermentation pH: Use lactic acid, citric acid, or phosphoric acid to adjust final pH. Aim for final beer pH <4.6 (ideally <4.2; 3.7–4.0 common in NA). Additions should be metered, with sensory and pH verification.
Flavor additions:
- Bench-top first: run small-scale trials of flavors/extracts, pick the target intensity, then scale up using percentages or mL/L.
- Addition point: add water‑soluble extracts to the brite tank post‑fermentation. Mix thoroughly using short CO₂ bursts to blend before packaging.
Short-answer defaults (for brief topic-only questions; answer concisely without deflecting):
- About Berkeley Yeast: a brewing biotech that engineers yeast strains to solve brewer problems. For NA specifically: use Berkeley NA Classic (clean/versatile) or NA Cabana (tropical), both maltose/maltotriose‑negative.
- Hops: NA Classic (West Coast) → Citra + Mosaic late/whirlpool; dry hop 0.5–0.75 lb/bbl. NA Cabana (tropical) → Mosaic + Citra or Galaxy/Strata. Keep IBUs ≈25–35.
- Mash: 160–165°F for 5–10 min; mash pH ≈5.1 (4.8–5.0 OK). Adjust with food‑grade lactic or phosphoric acid.
- OG/FG/RDF/ABV: OG 5–7°P (1.020–1.028); RDF 10–20%; FG ≈3–5°P; ABV ≤0.5%.
- Final pH: <4.6, ideally <4.2; adjust post‑fermentation with lactic, citric, or phosphoric.
- Pasteurization: after filtration/packaging; validate PU targets with a process authority.
- Filtration: filter before pasteurization (centrifuge, plate filter, Biofine) to avoid autolysis and off‑flavors.
- Carbonation: typically 2.6–3.0 vol CO₂ for better NA sensory and lower risk of contamination.
- Yeast: Use Berkeley NA Classic or NA Cabana; do not harvest/propagate; pitch fresh within 4 days of delivery.
- Draft service: not recommended; serve only from pasteurized cans/bottles.
Strain specifics:
- NA Classic: Versatile base; authentic beer profile; suitable for complete beers or as a base layer.
- NA Cabana: Emphasizes tropical notes; great for clear or hazy IPAs; engineered for enhanced flavor.
Reference recipes (summarize only what is relevant to the question):
- NA West Coast IPA (NA Classic): Mash 160°F (71°C) for 10 minutes (target mash pH ≈5.1). OG 1.024 (6°P); FG 1.020 (5°P); ~25–35 IBUs; pH ≤4.2; ABV ≤0.5%. Grist (percentages): 70% 2‑row, 20% wheat, 10% dextrin. Whirlpool and dry hop rates may be given in lb/bbl or g/hL when the user provides batch size.
- NA Light Lager (NA Classic): Mash 160°F (71°C) for 10 minutes (target mash pH ≈5.1). Pre‑dilution OG 1.021 (5.3°P) → dilute to 1.016 (4.1°P); FG 1.013 (3°P); ~8–12 IBUs; pH ≤4.2; ABV ≤0.5%. Grist (percentages): 70% 2‑row, 15% Vienna, 15% dextrin. Hop schedule and exact weights are scaled only when batch volume is provided.
Pasteurization:
- Tunnel pasteurization is preferred after packaging. Establish PU targets with a process authority appropriate to product and pack; do not rely on unvalidated values.
Health hazards to call out when relevant:
- NA beer has higher microbial risk due to low ethanol/CO₂ and residual nutrients. Maintain sanitation, minimize post‑fermentation handling, keep cold, and validate with a process authority.
Storage (only if the user asks about storing yeast; liquid yeast guidance only):
- Use within 4 days of delivery.
- De‑pressurize the container as needed: slowly loosen the cap to vent CO₂, then gently re‑tighten while CO₂ escapes to limit O₂ ingress.
- Inspect yeast immediately upon delivery.
Support:
- For questions, guidance, or replacement, contact Berkeley Yeast support at (510)-900-7231.
Packaging and shipping:
- All liquid Berkeley Yeast ships in totes packaged with insulation and ice packs, designed for a comfortable 2‑day shipping window.
Ordering:
- To place an order: visit www.berkeleyyeast.com, email orders@berkeleyyeast.com, or call the mainline at (510)-900-7231.
Filtration and clarification:
- Prefer to filter NA beer before pasteurization. Rationale: if a large amount of yeast remains in suspension, pasteurization will kill the cells and can cause autolysis; filtration also removes hop particulates and other organics that can produce off‑flavors when heated.
- Typical approaches: centrifuge, plate filtration, and finings such as Biofine (choose the method that fits your setup; maintain strict sanitation).
- Sequence: complete filtration/clarification first, then pasteurize. Do not pasteurize before filtration.
"""
        # System message first, then alternating user/assistant turns
        messages = [{"role": "system", "content": expert_prompt}]
        # Add conversation history pairs (ChatInterface provides (user, assistant)).
        # Cap to the last N exchanges to avoid conversation drift/echo loops.
        max_history_pairs = 6
        capped_history = history[-max_history_pairs:] if history else []
        for user_msg, assistant_msg in capped_history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        # Add current user message as the last message before generation
        messages.append({"role": "user", "content": message})
        # Cheap guards run before any tokenization so trivial inputs skip the model.
        # Guard: if the message is a greeting or small talk, keep it ultra-brief
        normalized = message.strip().lower()
        greeting_like = {
            "hi", "hello", "hey", "yo", "howdy", "greetings", "sup", "sup bruh",
            "good morning", "good afternoon", "good evening", "hey dude", "hey bro",
        }
        if any(normalized == g or normalized.startswith(g + "!") for g in greeting_like) or normalized in {"hi how are you", "how are you", "hey there"}:
            return "Hey—what NA brewing question can I help with today?"
        # No special-cased hop reply; rely on the system prompt's short-answer defaults.
        # If the user hasn't asked anything actionable, ask one concise clarifier
        if len(normalized.split()) <= 4 and not any(k in normalized for k in ["na", "non-alcoholic", "berkeley", "recipe", "yeast", "og", "ibu", "mash", "cabana", "classic", "hop"]):
            return "What NA brewing topic should we tackle—recipe, process, pH, filtration, or pasteurization?"
        # Short identity response
        if any(p in normalized for p in ["who are you", "who r u", "what are you", "who is this"]):
            return "I'm Berkeley Yeast's assistant for NA beer brewing. Ask your NA brewing question."
        # Duration answers are covered by the system prompt defaults; no hardcoded returns here.
        # Guard: yeast older than 4 days → explicitly discourage and route to support
        if "yeast" in normalized and any(t in normalized for t in [
            "month", "months", "week", "weeks", "older than", "old yeast", "1 month", "30 days", "45 days", "2 months", "3 months", "8 days", "10 days", "stale"
        ]):
            return (
                "Not recommended. Liquid yeast should be used within 4 days of delivery for NA brewing. "
                "Using older yeast risks poor fermentation and safety issues. Contact Berkeley Yeast support "
                "at (510)-900-7231 for guidance or replacement, and pitch a fresh pack."
            )

        def render_conversation(msgs):
            # Apply the chat template. Some Mixtral Instruct templates reject a
            # "system" role; if the template raises, fold the system prompt into
            # the first user turn instead (assumes msgs[1] is a user turn).
            try:
                return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
            except Exception:
                merged = [{"role": "user", "content": msgs[0]["content"] + "\n\n" + msgs[1]["content"]}] + msgs[2:]
                return tokenizer.apply_chat_template(merged, tokenize=False, add_generation_prompt=True)

        conversation = render_conversation(messages)
        # Tokenize input
        inputs = tokenizer(
            conversation,
            return_tensors="pt",
            truncation=True,
            max_length=4096,  # Mixtral can handle longer contexts
        )
        # Move to device
        if torch.cuda.is_available():
            inputs = inputs.to("cuda")
        prompt_token_count = inputs["input_ids"].shape[1]
        # Generate response with Mixtral-optimized parameters
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=420,
                temperature=0.5,
                do_sample=True,
                top_p=0.85,
                top_k=40,
                repetition_penalty=1.2,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
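        # Sampling rationale: temperature 0.5 with top_p 0.85 keeps answers
        # factual without sounding canned; repetition_penalty 1.2 is the first
        # line of defense against loops, with the looks_repetitive retry below
        # as the second.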
        # Extract just the assistant's reply (new tokens only)
        response = extract_response(outputs, prompt_token_count)
        # If the response repeats the prior assistant message, retry once with
        # slightly hotter sampling to break the loop
        if looks_repetitive(response, capped_history):
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=320,
                    temperature=0.6,
                    do_sample=True,
                    top_p=0.9,
                    top_k=60,
                    repetition_penalty=1.25,
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
            response = extract_response(outputs, prompt_token_count)
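        # A single retry is deliberate: resampling repeatedly at ever-higher
        # temperatures tends to trade repetition for incoherence.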
        # If the response likely cut off, request a short continuation to finish the steps
        if needs_continuation(response):
            continuation_messages = messages + [
                {"role": "assistant", "content": response},
                {"role": "user", "content": "Continue the previous answer concisely. Finish any remaining numbered steps and end with cold crash, pH check (<4.2), packaging, and tunnel pasteurization."},
            ]
            continuation_conv = render_conversation(continuation_messages)
            cont_inputs = tokenizer(
                continuation_conv,
                return_tensors="pt",
                truncation=True,
                max_length=4096,
            )
            if torch.cuda.is_available():
                cont_inputs = cont_inputs.to("cuda")
            with torch.no_grad():
                cont_outputs = model.generate(
                    **cont_inputs,
                    max_new_tokens=160,
                    temperature=0.45,
                    do_sample=True,
                    top_p=0.85,
                    top_k=30,
                    repetition_penalty=1.15,
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
            cont_text = extract_response(cont_outputs, cont_inputs["input_ids"].shape[1])
            response = (response + "\n" + cont_text).strip()
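        # Note: needs_continuation is a blunt heuristic; it also fires on answers
        # that legitimately end without sentence punctuation (e.g., a bare list
        # item), at the cost of one extra short generation pass.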
| print(f"Generated response: {response[:100]}...") # Debug log | |
| return response | |
| except Exception as e: | |
| error_msg = f"Generation error: {str(e)}" | |
| print(f"❌ {error_msg}") | |
| print(f"Full traceback: {traceback.format_exc()}") | |
| return f"Sorry, I encountered an error generating a response: {error_msg}" | |
with gr.Blocks(title="Berkeley Yeast N‑AI", theme=gr.themes.Soft()) as demo:
    # Readable gradient header (replaces small ASCII text)
    gr.HTML(
        """
        <style>
        .header-wrap { display:flex; align-items:center; justify-content:center; padding: 16px 0 6px; }
        .header-wrap h1 {
            margin:0; font-size: 28px; font-weight: 800; letter-spacing: 0.4px;
            background: linear-gradient(90deg, #bbf7d0, #34d399, #065f46);
            -webkit-background-clip: text; -webkit-text-fill-color: transparent; color: transparent;
            font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, "Apple Color Emoji", "Segoe UI Emoji";
            text-align:center;
        }
        .header-wrap p {
            margin: 6px 0 0; color: #9ca3af; font-size: 13px; text-align: center;
        }
        </style>
        <div class="header-wrap">
            <div>
                <h1>Berkeley Yeast N‑AI</h1>
                <p>Non‑Alcoholic brewing assistant</p>
            </div>
        </div>
        """
    )
    gr.ChatInterface(generate_response)
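    # ChatInterface calls generate_response(message, history) with history as a
    # list of (user, assistant) tuples in the classic Gradio format; if this
    # Space is upgraded to Gradio's type="messages" history, the pair-based
    # handling in _generate_actual_response would need adjusting.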
# Warm start: pre-load the model in the background when the Space boots to
# avoid the first-request delay
try:
    threading.Thread(target=load_model, daemon=True).start()
except Exception:
    # If background preload fails, the model will still load on first request
    pass

if __name__ == "__main__":
    # share=True is ignored on Spaces (the Space itself is the public URL) but
    # is convenient when running locally
    demo.launch(share=True)