rishu834763 committed on
Commit
cd50342
·
verified ·
1 Parent(s): dc1ec25

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -24
app.py CHANGED
@@ -1,39 +1,85 @@
1
- # app.py
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline # ← pipeline is here!
4
  from peft import PeftModel
5
  import gradio as gr
6
 
7
- # ===================================
8
- BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2" # Open, no gate!
9
  LORA_ADAPTER = "rishu834763/java-explainer-lora"
10
 
 
11
  quantization_config = BitsAndBytesConfig(
12
  load_in_4bit=True,
13
  bnb_4bit_quant_type="nf4",
14
  bnb_4bit_compute_dtype=torch.bfloat16,
15
  bnb_4bit_use_double_quant=True,
 
16
  )
17
 
18
- print("Loading Llama-3.1-8B-Instruct 4-bit + your LoRA...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  base_model = AutoModelForCausalLM.from_pretrained(
20
  BASE_MODEL,
21
  quantization_config=quantization_config,
22
- device_map="auto",
23
  torch_dtype=torch.bfloat16,
 
24
  trust_remote_code=True,
25
  )
26
 
 
27
  model = PeftModel.from_pretrained(base_model, LORA_ADAPTER)
28
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
 
29
  tokenizer.pad_token = tokenizer.eos_token
30
 
31
- # FIXED: pipeline from transformers, not torch
32
  pipe = pipeline(
33
  "text-generation",
34
  model=model,
35
  tokenizer=tokenizer,
36
- max_new_tokens=1024,
37
  temperature=0.3,
38
  top_p=0.95,
39
  do_sample=True,
@@ -41,32 +87,45 @@ pipe = pipeline(
41
  return_full_text=False,
42
  )
43
 
44
- SYSTEM_PROMPT = "You are an expert Java teacher. Explain concepts clearly with code examples."
45
 
46
  def chat(message: str, history):
47
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
48
- for user, assistant in history:
49
- messages.append({"role": "user", "content": user})
50
- if assistant:
51
- messages.append({"role": "assistant", "content": assistant})
 
 
52
  messages.append({"role": "user", "content": message})
53
 
54
  prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
55
- outputs = pipe(prompt)
56
- return outputs[0]["generated_text"]
 
 
 
 
 
 
57
 
58
- # ===================================
59
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
60
- gr.Markdown("# Java Explainer\nPowered by your LoRA on Llama-3.1-8B-Instruct (4-bit)")
61
- chatbot = gr.Chatbot(height=620)
62
- msg = gr.Textbox(placeholder="Ask anything about Java...", container=False)
 
63
 
64
  with gr.Row():
65
- send = gr.Button("Send 🚀", variant="primary")
66
- clear = gr.Button("Clear")
67
 
68
  send.click(chat, [msg, chatbot], [msg, chatbot]).then(lambda: "", outputs=msg)
69
  msg.submit(chat, [msg, chatbot], [msg, chatbot]).then(lambda: "", outputs=msg)
70
  clear.click(lambda: None, None, chatbot, queue=False)
71
 
72
- demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
# app.py — low-VRAM serving of Mistral-7B-Instruct-v0.2 + Java-explainer LoRA
# (November 2025, T4-compatible).
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import PeftModel
import gradio as gr

# Exact base model the LoRA adapter was trained against.
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
LORA_ADAPTER = "rishu834763/java-explainer-lora"

# T4 GPUs (compute capability 7.5) do not support bfloat16 — on that hardware
# bnb 4-bit compute must run in float16.  Pick bf16 only when it is actually
# available so the "T4-compatible" claim holds.
COMPUTE_DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

# Enhanced 4-bit config with CPU offload enabled.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=COMPUTE_DTYPE,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,  # key fix: allow fp32 CPU offload for layers that don't fit
)

print("Loading Mistral-7B-Instruct-v0.2 (4-bit with CPU offload) + your Java LoRA...")

# Custom device map: embeddings, the first 8 transformer layers, the final norm
# and the LM head stay on GPU 0; the remaining layers are offloaded to CPU.
# Mistral-7B has 32 decoder layers — the loop replaces a 35-entry literal dict
# that was easy to get wrong.
device_map = {"model.embed_tokens": 0, "model.norm": 0, "lm_head": 0}
for layer_idx in range(32):
    device_map[f"model.layers.{layer_idx}"] = 0 if layer_idx < 8 else "cpu"

# Load the 4-bit base model with the mixed GPU/CPU placement.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map=device_map,          # GPU first, CPU fallback
    torch_dtype=COMPUTE_DTYPE,
    low_cpu_mem_usage=True,         # reduces RAM spike while loading
    trust_remote_code=True,
)

# Apply the LoRA adapter (lightweight; negligible memory overhead).
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token  # Mistral ships no pad token

# Text-generation pipeline tuned for the mixed-device placement.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # reduced for speed on low VRAM
    temperature=0.3,
    top_p=0.95,
    do_sample=True,
    return_full_text=False,
)

SYSTEM_PROMPT = "You are an expert Java teacher with 15+ years of experience. Always explain concepts clearly, include clean code examples, and use best practices."
92
def chat(message: str, history):
    """Generate a reply and return the updated Gradio chat state.

    Parameters
    ----------
    message : str
        The user's new question.
    history : list[tuple[str, str]]
        Prior (user, assistant) pairs from the gr.Chatbot component.

    Returns
    -------
    tuple[str, list]
        ("", updated_history) — one value for each of the two outputs the UI
        wires this handler to ([msg, chatbot]).  The original version returned
        a single string, which mismatches the two declared outputs, so Gradio
        raised a value-count error and the chatbot never displayed the answer.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    # Replay the previous turns so the model sees the full conversation.
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:  # the last pair may not have an assistant reply yet
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    reply = pipe(prompt)[0]["generated_text"]
    # Clear the textbox and append the new exchange to the chatbot history.
    return "", history + [(message, reply)]
106
# ---- Gradio UI (layout unchanged) ---------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), title="Java Explainer Pro") as demo:
    gr.Markdown("# Java Explainer Pro\nFine-tuned on **rishu834763/java-explainer-lora** + **Mistral-7B-v0.2** (Low-VRAM Optimized)")
    gr.Markdown("Ask anything about Java — from basics to Spring Boot, concurrency, JVM internals, and more!")

    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(
        placeholder="e.g. Explain CompletableFuture with a real-world example",
        label="Your Java Question",
        container=False,
    )

    with gr.Row():
        send_btn = gr.Button("Send", variant="primary", scale=2)
        clear_btn = gr.Button("Clear Chat")

    def _reset_textbox():
        # Empty the input box once a message has been handled.
        return ""

    # The Send button and pressing Enter both route through the same handler.
    for trigger in (send_btn.click, msg.submit):
        trigger(chat, [msg, chatbot], [msg, chatbot]).then(_reset_textbox, outputs=msg)

    clear_btn.click(lambda: None, None, chatbot, queue=False)

demo.queue(max_size=50).launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,
)