#!/usr/bin/env python3
"""
LEXPT Hugging Face Space - Professional Legal Analysis
Replicates the structure of professional HF Spaces with reasoning display
"""

import gradio as gr
import time
import torch
from typing import Generator

# Global model state: populated by load_model(), read by generate_response().
model = None
tokenizer = None
model_loaded = False

# Model configuration.
BASE_MODEL_ID = "openai/gpt-oss-20b"  # 20B base model

# Optional PEFT adapter repo id applied on top of BASE_MODEL_ID.
# load_model() and generate_response() both read this name, so it must exist;
# None (or an empty string) means "serve the base model only".
ADAPTER_ID = None

# System prompt for the legal assistant. generate_response() sends this text
# verbatim as the "system" message ahead of the user's query, so any edit here
# changes model behavior for every request.
SYSTEM_PROMPT = """
ROLE
You are a U.S. legal analysis assistant focused on appellate and habeas issues. Your job is to produce precise, jurisdiction-aware answers drawn from the user's prompt and any text they embed (e.g., an opinion extract). You must not invent facts, quotations, or citations.

CORE DIRECTIVE — FINAL ANSWER ONLY
- Output ONLY the final answer to the user's prompt.
- Do NOT include prefaces, meta-commentary, chain-of-thought, or self-references.
- Do NOT restate the question, apologize, or add disclaimers.
- Do NOT add citations unless the prompt explicitly requests them.

SCOPE & SOURCES
- Default to the jurisdiction and stage implied by the prompt. If an opinion text is provided, treat it as the primary source of truth; do not add outside facts.
- If a request is impossible to answer from the provided materials, respond exactly: "Insufficient information."
- If the prompt asks for general doctrine (e.g., variance vs. constructive amendment, preservation standards, habeas default), state black-letter rules succinctly without citing unless requested.

FORMATTING & STYLE
- If the prompt says "list," return a numbered list starting at 1, with one item per line.
- If the prompt asks for a "checklist," use short bullet points; keep each bullet to one sentence.
- If the prompt asks for an "IRAC," use exactly these section headers in order, each on its own line: Issue; Rule; Application; Conclusion. No extra headings or text.
- If the prompt asks for an "argument for petitioner" or "argument for the state," produce 4–8 concise point-headings with brief supporting parentheticals or sub-bullets.
- If a word/line limit is specified, obey it strictly.
- Use party names and case captions exactly as given in the prompt.

CITATIONS (ONLY IF REQUESTED)
- When citations are explicitly requested, use Bluebook style:
  • First mention: full citation with court, year, and pincites if provided/clear.
  • Later mentions: short form with pincites.
  • For federal rules, cite rule and subdivision (e.g., Fed. R. Evid. 801(d)(2)(E)).
- If the prompt requests a "citation string," include the best supporting authorities in descending order of weight and relevance.

SUBSTANTIVE GUIDANCE (WHEN ASKED)
- Variance vs. constructive amendment: define both; explain that a variance is a proof–pleading discrepancy assessed for prejudice; a constructive amendment alters the charge's elements and is structural on direct review.
- Preservation/waiver: identify the contemporaneous objection rule, motion grounds specificity, and the effect of not requesting a continuance when surprised.
- Habeas procedural default: outline cause-and-prejudice (and actual-innocence gateway) if asked.
- Standards of review: identify the applicable standard (e.g., abuse of discretion, de novo, harmless-error) when requested and tie it to the posture.
- Evidence questions: if asked, cover authentication, hearsay/non-hearsay routes (including 801(d)(2)(E)), Rule 403, and the permissibility of juror aids like transcripts.

CONSTRAINTS
- Do not invent case names, record cites, or quotations.
- Do not introduce new facts not in the prompt's record.
- Keep tense and terminology consistent with the prompt (e.g., "appellant," "petitioner," "respondent," "state").
- Be concise and information-dense; avoid filler.

DEFAULT OUTPUT BEHAVIOR
- If the prompt is ambiguous but answerable, choose the most reasonable interpretation and answer directly without commentary.
- If the prompt requests formatting (e.g., numbered list, IRAC, checklist), follow it exactly.
"""

def load_model() -> str:
    """Load the tokenizer and model into the module-level globals.

    On a CUDA machine the base model is loaded with 4-bit NF4 quantization;
    otherwise it is loaded unquantized in float32 on the CPU. If an adapter id
    is configured, a PEFT adapter is applied on top, falling back to the base
    model when the adapter cannot be loaded.

    Returns:
        A human-readable status string. Failures are reported through this
        string rather than raised, so the caller can display it directly.
    """
    global model, tokenizer, model_loaded

    # Read the optional adapter id defensively: a missing ADAPTER_ID constant
    # must mean "base model only", not a NameError that aborts the whole load.
    adapter_id = globals().get("ADAPTER_ID")

    try:
        # Imported lazily so a missing/broken install is reported as a status
        # message instead of crashing the app at import time.
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

        # Load tokenizer; generation needs a pad token, so reuse EOS if unset.
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        if torch.cuda.is_available():
            # GPU available - use 4-bit quantization
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            )

            model = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL_ID,
                quantization_config=bnb_config,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                trust_remote_code=True,
            )
        else:
            # CPU fallback - no quantization
            model = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL_ID,
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )

        # Apply the adapter, if one is configured, with fallback to the base model.
        adapter_loaded = False
        if adapter_id:
            try:
                # peft is only required when an adapter is actually requested.
                from peft import PeftModel

                model = PeftModel.from_pretrained(model, adapter_id)
                adapter_loaded = True
                print(f"✅ Loaded adapter: {adapter_id}")
            except Exception as adapter_error:
                print(f"⚠️ Adapter loading failed: {adapter_error}")
                print("📝 Using base model without adapter")

        model.eval()

        model_loaded = True
        adapter_status = f" + {adapter_id}" if adapter_loaded else " (base model only)"
        return f"✅ LEXPT Model loaded successfully!{adapter_status}"

    except Exception as e:
        # Never advertise a half-initialised model as ready.
        model_loaded = False
        print(f"❌ Model loading error: {str(e)}")
        return f"❌ Error loading model: {str(e)}\n\n💡 Try upgrading to T4 Small GPU tier in Space settings"

def generate_reasoning_steps(query: str, delay: float = 0.5) -> Generator[str, None, None]:
    """Yield the fixed progress messages shown as the "thinking process".

    The messages are canned; they are not derived from ``query`` or from the
    model's output.

    Args:
        query: The user's query. Accepted for interface compatibility; the
            messages do not depend on it.
        delay: Seconds to pause between consecutive messages. The default
            keeps the original 0.5 s pacing; pass 0 to emit them immediately.

    Yields:
        Each progress message, in display order.
    """
    steps = (
        "🤔 Analyzing the legal query...",
        "📚 Reviewing relevant legal doctrines and precedents...",
        "⚖️ Applying legal principles to the specific facts...",
        "✍️ Structuring the response according to legal formatting requirements...",
        "🎯 Finalizing analysis with precise legal terminology...",
    )
    for position, message in enumerate(steps):
        # Pause between messages only: none before the first, none after the last.
        if position and delay > 0:
            time.sleep(delay)
        yield message

def generate_response(query: str, show_reasoning: bool = False):
    """Generate a legal analysis for ``query``, optionally with reasoning steps.

    Args:
        query: The user's legal question. ``None``, empty, or whitespace-only
            input is rejected with a prompt to enter a query.
        show_reasoning: When true, the canned progress messages from
            generate_reasoning_steps() are returned as the second element.

    Returns:
        A ``(analysis_markdown, reasoning_text)`` tuple. Errors are reported
        in the first element as a message string rather than raised, so the
        UI always has something to display.
    """
    if not model_loaded:
        return "❌ Model not loaded. Please wait for initialization.", ""

    # API callers can send null; treat it like an empty textbox.
    if not query or not query.strip():
        return "Please enter a legal query.", ""

    reasoning_text = ""

    # Show reasoning steps if requested. A blank line between steps keeps each
    # one on its own line when rendered as Markdown.
    if show_reasoning:
        reasoning_text = "\n\n".join(generate_reasoning_steps(query))

    try:
        # Build messages
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": query},
        ]

        # Tokenize. return_dict=True always yields a mapping holding both
        # input_ids and attention_mask, independent of the library default.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        start_time = time.perf_counter()

        # Greedy decoding: sampling knobs such as temperature are ignored when
        # do_sample=False, so none are passed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1200,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )

        latency = time.perf_counter() - start_time

        # Decode only the newly generated tokens, skipping the echoed prompt.
        prompt_len = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

        # The adapter id is optional; a missing constant means base model only.
        adapter_id = globals().get("ADAPTER_ID")
        model_suffix = f" + {adapter_id}" if adapter_id else " (base model)"

        # Assemble the Markdown with every line at column 0: indented lines
        # would be rendered as a code block.
        final_output = "\n".join(
            [
                "## ⚖️ Legal Analysis",
                "",
                f"**Query:** {query}",
                "",
                # Two trailing spaces = Markdown hard line break.
                f"**Response Time:** {latency:.2f}s  ",
                f"**Model:** {BASE_MODEL_ID}{model_suffix}",
                "",
                "---",
                "",
                response,
                "",
                "---",
                "*Generated by LEXPT - Legal Analysis AI*",
                "",
            ]
        )

        return final_output, reasoning_text

    except Exception as e:
        return f"❌ Generation error: {str(e)}", reasoning_text

# Example legal queries offered in the UI; each entry is one complete prompt
# that is copied into the query textbox when selected.
EXAMPLE_QUERIES = [
    "Draft 5 advocacy point-headings for petitioner that a knife→gun variance violated Sixth-Amendment notice",
    "Explain the difference between a 'variance' and a 'constructive amendment' of the charging instrument",
    "Analyze prejudice under the variance doctrine: Did the proof at trial (gun vs. knife) mislead the defense?",
    "Write a crisp one-page IRAC on Ridgeway: Issue (variance/notice), Rule, Application, Conclusion",
    "Create a checklist of record cites you would pull to brief this issue"
]

# Auto-load model on startup (for HF Spaces)
print("🚀 Auto-loading LEXPT model...")
model_status = "Loading..."
try:
    # load_model() returns one status string. Iterating over it would walk the
    # string character by character and keep only its last character.
    model_status = load_model()
except Exception as e:
    model_status = f"❌ Auto-load failed: {str(e)}"
print(model_status)

# Create Gradio interface (replicating professional HF Space design).
# Everything below is declared inside the Blocks context bound to `demo`,
# which the __main__ guard at the bottom of the file launches.
with gr.Blocks(
    title="LEXPT - Legal Analysis AI", 
    theme=gr.themes.Soft(),
    # Custom CSS classes; .main-header is attached to the header below via
    # elem_classes and .status-box is used in the status HTML snippet.
    css="""
    .main-header { text-align: center; margin-bottom: 2rem; }
    .reasoning-box { background: #f8f9fa; padding: 1rem; border-radius: 8px; margin: 1rem 0; }
    .status-box { background: #e8f5e8; padding: 0.5rem; border-radius: 4px; }
    """
) as demo:
    
    # Page header.
    gr.Markdown("""
    # ⚖️ LEXPT - Legal Analysis AI
    
    **Professional legal analysis powered by fine-tuned GPT-OSS-20B**  
    Specialized for appellate and habeas corpus issues
    
    *Give it a couple of seconds to start. You can enable reasoning level to see the thinking process.*
    """, elem_classes="main-header")
    
    # Model status box. model_status is interpolated once, when this
    # module-level code runs, so it shows the outcome of the startup load
    # attempt and is not refreshed afterwards.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(f"""
            <div class="status-box">
            <strong>Model Status:</strong> {model_status}
            </div>
            """)
    
    # Input row: query textbox and controls on the left (scale=3),
    # example queries on the right (scale=1).
    with gr.Row():
        with gr.Column(scale=3):
            query_input = gr.Textbox(
                label="Legal Query",
                placeholder="Enter your legal analysis request...",
                lines=4
            )
            
            with gr.Row():
                submit_btn = gr.Button("⚖️ Analyze", variant="primary")
                # Passed to generate_response as show_reasoning.
                reasoning_checkbox = gr.Checkbox(
                    label="Show Reasoning Process", 
                    value=False
                )
        
        with gr.Column(scale=1):
            gr.Markdown("### 📋 Example Queries")
            # Examples feed the query textbox only.
            gr.Examples(
                examples=EXAMPLE_QUERIES,
                inputs=query_input,
                label=""
            )
    
    # Output sections
    output_response = gr.Markdown(label="Analysis")
    
    # Reasoning section (collapsible like the HF Space demo), closed by default.
    # NOTE(review): reasoning_accordion is bound but not referenced in the code shown.
    with gr.Accordion("🤔 Click to view Thinking Process", open=False) as reasoning_accordion:
        output_reasoning = gr.Markdown("Reasoning steps will appear here when enabled...")
    
    # Handle submission: generate_response(query, show_reasoning) returns
    # (analysis_markdown, reasoning_text), mapped positionally to the outputs.
    submit_btn.click(
        fn=generate_response,
        inputs=[query_input, reasoning_checkbox],
        outputs=[output_response, output_reasoning]
    )
    
    # Static footer describing the deployment.
    gr.Markdown("""
    ---
    ### 🔧 Technical Details
    
    - **Base Model:** OpenAI GPT-OSS-20B (20 billion parameters)
    - **Fine-tuning:** PEFT adapter trained on legal analysis tasks
    - **Specialization:** Appellate law, habeas corpus, constitutional issues
    - **Optimization:** 4-bit quantization for efficient inference
    
    *This Space demonstrates professional legal AI deployment on Hugging Face infrastructure.*
    """)

if __name__ == "__main__":
    # Launch settings collected in one mapping. show_api stays enabled so the
    # external webpage can reach the app through its API.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_api": True,
    }
    demo.launch(**launch_options)