from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
import tempfile, os, json, shutil, math, re

from extractor import PDFExtractor
from agree_calculator import AGREECalculator
from nqs_calculator import NQSCalculator
from complexmogapi_calculator import ComplexMoGAPICalculator, get_param_definitions
from eco_scale_calculator import EcoScaleCalculator
from cafri_calculator import CaFRICalculator

# FastAPI application object; every route in this module is registered on it.
app = FastAPI(title="AGREE API")

# CORS: any origin, method and header is accepted, while credentialed
# cross-origin requests are not allowed (allow_credentials=False).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Temporary storage for session files.
# A new directory is created every time this module is imported; each upload
# gets its own sub-directory named after its session id.
# NOTE(review): nothing in the visible code removes this directory or its
# per-session sub-directories — confirm cleanup happens elsewhere.
UPLOAD_DIR = tempfile.mkdtemp(prefix="agree_")


# ─────────────────────────────────────────────────────────────
# Academic PDF boilerplate patterns — compiled once, when the module is imported.
# clean_paper_text() applies every pattern in list order with
# pattern.sub('', text), i.e. each match is deleted before the LLM call.
#
# NOTE(review): several patterns match a whole line as soon as a keyword
# appears anywhere in it (e.g. "Received", "Funding", "Wiley", "Vol. 3"), and
# the all-uppercase pattern also matches headings such as "INTRODUCTION".
# Genuine body text can therefore be removed as well — confirm this trade-off
# is acceptable for the downstream extraction.
_BOILERPLATE_PATTERNS = [
    # ─ Page numbers (standalone or decorated) ─────────────────────────
    re.compile(r'^\s*\d{1,4}\s*$', re.MULTILINE),                      # bare page numbers
    re.compile(r'^\s*[-–—]\s*\d{1,4}\s*[-–—]\s*$', re.MULTILINE),    # – 12 –
    re.compile(r'^\s*Page\s+\d+\s*(of\s+\d+)?\s*$', re.MULTILINE | re.IGNORECASE),  # "Page 3" / "Page 3 of 10"

    # ─ DOI / URL lines (lines consisting only of a URL or a DOI) ──────
    re.compile(r'^\s*https?://\S+\s*$', re.MULTILINE),
    re.compile(r'^\s*doi\s*:\s*\S+\s*$', re.MULTILINE | re.IGNORECASE),
    re.compile(r'^\s*DOI:\s*10\.\d{4,}/\S+\s*$', re.MULTILINE | re.IGNORECASE),

    # ─ Journal / volume / issue / received-accepted metadata ────────
    # Whole line is removed when it contains one of the keywords below.
    re.compile(
        r'^.{0,120}(Volume|Vol\.?|Issue|No\.?|pp\.?|Pages?)\s+\d.{0,60}$',
        re.MULTILINE | re.IGNORECASE
    ),
    re.compile(
        r'^.{0,120}(Received|Accepted|Revised|Published|Available online).{0,80}$',
        re.MULTILINE | re.IGNORECASE
    ),
    re.compile(
        r'^.{0,120}(Journal of|Analytica|Talanta|Analyst|Chemosphere|Chromatogr|Spectrochim|Microchem|TrAC|Molecules|Int\.? J\.?).{0,120}$',
        re.MULTILINE | re.IGNORECASE
    ),

    # ─ Copyright / licence / publisher notices ───────────────────
    re.compile(
        r'^.{0,200}(\u00a9|Copyright|All rights reserved|Elsevier|Springer|Wiley|ACS Publications|Royal Society|Taylor & Francis|MDPI|BMC|Creative Commons|CC BY).{0,200}$',
        re.MULTILINE | re.IGNORECASE
    ),

    # ─ Author affiliation blocks (lines starting with superscript-like patterns) ─
    # A 1–4 character marker (digit, letter, *, dagger, section sign, ...)
    # followed by an institution keyword.
    re.compile(r'^\s*[1-9a-z,;\*\u2020\u2021\u00a7]{1,4}\s+(Department|School|Faculty|Institute|University|College|Laboratory|Center|Centre|Division).{0,200}$',
               re.MULTILINE | re.IGNORECASE),
    re.compile(r'^\s*(E-?mail|Correspondence|\*\s*Corresponding author).{0,200}$',
               re.MULTILINE | re.IGNORECASE),

    # ─ ORCID / CRediT / funding statement lines ─────────────────
    re.compile(r'^.{0,200}(ORCID|CRediT|Author contribution|Funding|Acknowledgem|Declaration of competing interest|Conflict of interest).{0,200}$',
               re.MULTILINE | re.IGNORECASE),
    re.compile(r'^\s*\d{4}-\d{4}-\d{4}-\d{4}\s*$', re.MULTILINE),   # raw ORCID numbers

    # ─ Running headers / footers (short lines repeated near page boundaries) ─
    # Catch lines that are pure UPPERCASE and very short (typical running headers).
    # Case-sensitive on purpose: no IGNORECASE flag here.
    re.compile(r'^\s*[A-Z][A-Z \-&:]{5,60}\s*$', re.MULTILINE),

    # ─ Reference list entries (numbered citations at end of paper) ─────
    # Lines starting with [n] or n. followed by author initials pattern
    re.compile(r'^\s*\[\d{1,3}\]\s+[A-Z][a-z]*.{0,300}$', re.MULTILINE),
    re.compile(r'^\s*\d{1,3}\.\s+[A-Z][a-z]{0,20},\s+[A-Z]\.?.{0,300}$', re.MULTILINE),
]


def clean_paper_text(text: str) -> str:
    """
    Remove academic-PDF boilerplate from extracted text ahead of an LLM call.

    Every pattern in ``_BOILERPLATE_PATTERNS`` is deleted from the text, very
    short stray lines are discarded, and runs of blank lines are squeezed to a
    single blank line. The result is returned with surrounding whitespace
    stripped.
    """
    # Pass 1: delete everything the boilerplate patterns recognise.
    for pattern in _BOILERPLATE_PATTERNS:
        text = pattern.sub('', text)

    # Pass 2: the deletions leave gaps behind; squeeze them before filtering.
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Pass 3: filter line by line.
    survivors = []
    for raw_line in text.split('\n'):
        body = raw_line.strip()
        if body == '':
            # Whitespace-only lines become true blank lines (paragraph breaks).
            survivors.append('')
        elif len(body) >= 8 or re.match(r'^(\d+\.?|[-•–])', body):
            # Long enough to matter, or a short numbered/bulleted item.
            survivors.append(raw_line)
        # Anything else is a short fragment (header scrap, lone symbol): dropped.

    # Pass 4: dropping fragments can create new blank runs; squeeze once more.
    squeezed = re.sub(r'\n{3,}', '\n\n', '\n'.join(survivors))
    return squeezed.strip()

# ─────────────────────────────────────────────────────────────
# STEP 1: Upload PDF & extract raw text
# ─────────────────────────────────────────────────────────────
@app.post("/step1/upload-pdf")
async def step1_upload_pdf(file: UploadFile = File(...)):
    """Upload a PDF, return the saved file id and extracted plain text.

    The upload is stored as ``document.pdf`` inside a new per-session
    directory under ``UPLOAD_DIR``; the extracted text is written next to it
    as ``extracted.txt`` so later steps can locate both through the returned
    ``session_id``.

    Returns:
        dict with ``session_id``, ``filename``, ``page_count``,
        ``char_count`` and the extracted ``text``.

    Raises:
        HTTPException: 500 when PyPDF2 cannot open the file or extract text.
    """
    # Save permanently for this session
    session_id = os.urandom(8).hex()
    session_dir = os.path.join(UPLOAD_DIR, session_id)
    os.makedirs(session_dir, exist_ok=True)

    pdf_path = os.path.join(session_dir, "document.pdf")
    content = await file.read()
    with open(pdf_path, "wb") as f:
        f.write(content)

    # Extract text using PyPDF2
    import PyPDF2
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Take the page count from the same reader instead of re-opening
            # (and never closing) the file a second time.
            page_count = len(reader.pages)
            page_texts = [page.extract_text() for page in reader.pages]
    except Exception as e:
        # The caller never learns the session id on failure, so the directory
        # would be orphaned; remove it on a best-effort basis.
        shutil.rmtree(session_dir, ignore_errors=True)
        raise HTTPException(status_code=500, detail=f"PDF text extraction failed: {str(e)}") from e

    # Pages without extractable text are skipped; every kept page ends in "\n".
    text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    # Save extracted text
    text_path = os.path.join(session_dir, "extracted.txt")
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(text)

    return {
        "session_id": session_id,
        "filename": file.filename,
        "page_count": page_count,
        "char_count": len(text),
        "text": text
    }


# ─────────────────────────────────────────────────────────────
# STEP 1b: Serve the stored PDF for the viewer
# ─────────────────────────────────────────────────────────────
@app.get("/step1/pdf/{session_id}")
async def serve_pdf(session_id: str):
    """Return the PDF stored for ``session_id`` so the viewer can display it.

    Raises:
        HTTPException: 404 when the id is malformed or no PDF is stored for it.
    """
    # Session ids are minted with os.urandom(8).hex() (16 lowercase hex chars).
    # Rejecting everything else keeps values such as ".." from being joined
    # into the filesystem path below.
    if not re.fullmatch(r"[0-9a-f]{16}", session_id):
        raise HTTPException(status_code=404, detail="PDF not found")

    pdf_path = os.path.join(UPLOAD_DIR, session_id, "document.pdf")
    if not os.path.exists(pdf_path):
        raise HTTPException(status_code=404, detail="PDF not found")
    return FileResponse(pdf_path, media_type="application/pdf")


# ─────────────────────────────────────────────────────────────
# STEP 2: Run LLM analysis on extracted text
# ─────────────────────────────────────────────────────────────
def _evidence_to_highlights(evidence) -> dict:
    """Turn the LLM's ``_evidence`` mapping into ``{param_key: [display strings]}``."""
    highlights: dict = {}
    if not isinstance(evidence, dict):
        return highlights
    for param, ev in evidence.items():
        if isinstance(ev, dict):
            # The model may emit null or non-string values; coerce before strip().
            quote = str(ev.get("quote") or "").strip()
            reasoning = str(ev.get("reasoning") or "").strip()
            parts = []
            if quote:
                parts.append(f'📖 "{quote}"')
            if reasoning:
                parts.append(f'💡 {reasoning}')
            if parts:
                highlights[param] = parts
        elif isinstance(ev, str) and ev.strip():
            highlights[param] = [ev.strip()]
    return highlights


@app.post("/step2/llm-analyze")
async def step2_llm_analyze(
    session_id: str = Form(...),
    api_key: str = Form(...),
    provider: str = Form("gemini"),
    analysis_type: str = Form("agree"),
    target_technique: str = Form("")
):
    """Run LLM to extract parameters + per-parameter evidence from stored text.
    Returns extracted_data + evidence dict (with quote + reasoning per param).

    The text saved by Step 1 is cleaned with ``clean_paper_text`` and passed to
    ``PDFExtractor.analyze_document``. The ``_evidence`` entry of the result is
    split off and reshaped into ``highlights``; the remaining data is saved as
    ``extracted_data.json`` in the session directory.

    Raises:
        HTTPException: 404 if the session has no extracted text (or the id is
            malformed), 401 for a missing/rejected API key, 429 for provider
            rate limits, 400 if the provider cannot be initialised, 500 for
            any other analysis failure.
    """
    # session_id arrives as a free-form form field and is joined into paths
    # that are read AND written below. Only accept the shape produced by
    # Step 1 (os.urandom(8).hex()) so it cannot point outside UPLOAD_DIR.
    if not re.fullmatch(r"[0-9a-f]{16}", session_id):
        raise HTTPException(status_code=404, detail="Extracted text not found. Run Step 1 first.")

    text_path = os.path.join(UPLOAD_DIR, session_id, "extracted.txt")
    if not os.path.exists(text_path):
        raise HTTPException(status_code=404, detail="Extracted text not found. Run Step 1 first.")

    # Validate API key before touching the provider at all, so a blank key is
    # always reported as 401 rather than as a provider-initialisation error.
    if not api_key.strip() and provider not in ("lmstudio", "ollama", "local"):
        raise HTTPException(status_code=401, detail="API key is missing. Please paste your API key in the sidebar and try again.")

    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Strip boilerplate (page numbers, headers, author info, references, etc.)
    # before sending to the LLM to reduce token load.
    cleaned_text = clean_paper_text(text)

    try:
        extractor = PDFExtractor(provider=provider, api_key=api_key)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to init LLM provider '{provider}': {str(e)}") from e

    try:
        raw = extractor.analyze_document(cleaned_text, analysis_type=analysis_type, target_technique=target_technique)
    except Exception as e:
        err_str = str(e)
        err_lower = err_str.lower()
        # Detect 401 / auth errors from any provider and surface as HTTP 401
        if '401' in err_str or any(marker in err_lower for marker in ('authentication', 'invalid api key', 'unauthorized')):
            raise HTTPException(status_code=401, detail=f"Invalid or missing API key for provider '{provider}'. Please check your key and try again. (Details: {err_str[:200]})") from e
        # Detect 429 rate-limit errors and surface as HTTP 429
        if '429' in err_str or any(marker in err_lower for marker in ('rate limit', 'rate-limit', 'too many requests')):
            raise HTTPException(status_code=429, detail=f"Rate limit reached for provider '{provider}'. Please wait a moment and try again, or switch to a different provider. (Details: {err_str[:300]})") from e
        raise HTTPException(status_code=500, detail=f"LLM analysis failed: {err_str}") from e

    if not isinstance(raw, dict):
        # Everything below treats the result as a mapping.
        raise HTTPException(status_code=500, detail="LLM analysis failed: provider returned a non-object result")

    # Separate _evidence from the main extracted_data
    evidence = raw.pop("_evidence", {})

    # Build highlights dict compatible with frontend: {param_key: ["quote — reasoning"]}
    highlights = _evidence_to_highlights(evidence)

    # Also run legacy text search for AGREE if evidence is sparse.
    # Evidence-based entries win; the legacy search only fills the gaps.
    if analysis_type == "agree" and len(highlights) < 5:
        for k, v in find_highlights(text, raw).items():
            highlights.setdefault(k, v)

    # Save results
    result_path = os.path.join(UPLOAD_DIR, session_id, "extracted_data.json")
    with open(result_path, "w", encoding="utf-8") as f:
        json.dump(raw, f, indent=2)

    return {
        "extracted_data": raw,
        "highlights": highlights
    }


def find_highlights(text: str, data: dict) -> dict:
    """Collect up to three matching text lines for each AGREE parameter.

    A line qualifies for a parameter when it contains (case-insensitively) any
    of that parameter's search terms; terms of two characters or fewer and
    empty terms never match. Lines shorter than five characters after
    stripping are ignored, snippets are cut to 200 characters and duplicates
    are kept only once.
    """
    search_terms = {
        "p1_option": [data.get("p1_option", ""), "sample", "treatment", "sampling", "remote sensing", "in-field"],
        "p2_amount": ["sample amount", "sample weight", "sample volume", str(data.get("p2_amount", "")), "g ", "mL", "mg"],
        "p3_option": [data.get("p3_option", ""), "in-line", "on-line", "at-line", "off-line", "inline", "online"],
        "p4_steps": ["steps", "procedure", "filtration", "extraction", "centrifugation", "evaporation", "derivatization"],
        "p5_automation": [data.get("p5_automation", ""), "automatic", "semi-automatic", "manual", "miniatur"],
        "p6_has_derivatization": ["derivat", "reagent", "CAS"],
        "p7_waste_amount": ["waste", "solvent", "reagent volume", str(data.get("p7_waste_amount", ""))],
        "p8_analytes_per_run": ["analyte", "parameter", "compound", "simultaneous", "per hour", "throughput"],
        "p9_technique": [data.get("p9_technique", ""), "HPLC", "GC", "LC-MS", "FTIR", "UV", "NMR", "SPE"],
        "p10_biobased_status": ["bio-based", "renewable", "bioethanol", "green solvent", data.get("p10_biobased_status", "")],
        "p11_has_toxic_reagents": ["toxic", "hazardous", "acetonitrile", "methanol", "chloroform", "benzene"],
        "p12_threats_count": ["flammable", "corrosive", "explosive", "oxidis", "aquatic", "bioaccumul", "persistent"],
    }

    # (stripped line, lower-cased full line) for every line worth searching.
    candidates = [
        (raw_line.strip(), raw_line.lower())
        for raw_line in text.split("\n")
        if len(raw_line.strip()) >= 5
    ]

    results = {}
    for param, terms in search_terms.items():
        # Only non-empty terms longer than two characters take part.
        needles = [str(term).lower() for term in terms if term and len(str(term)) > 2]
        snippets = []
        for stripped, lowered in candidates:
            if not any(needle in lowered for needle in needles):
                continue
            snippet = stripped[:200]
            if snippet and snippet not in snippets:
                snippets.append(snippet)
        results[param] = snippets[:3]  # Max 3 snippets per param

    return results


# ─────────────────────────────────────────────────────────────
# STEP 3: Calculate the 12 principles
# ─────────────────────────────────────────────────────────────
@app.post("/step3/calculate")
async def step3_calculate(
    extracted_data: str = Form(...)
):
    """Score the 12 AGREE principles for the parameters in ``extracted_data``.

    ``extracted_data`` is a JSON string; it is decoded, handed to
    ``AGREECalculator.calculate_all`` and the resulting scores are returned
    together with the per-principle breakdown from ``build_breakdown``.
    Undecodable JSON is answered with HTTP 400.
    """
    try:
        params = json.loads(extracted_data)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON for extracted_data")

    principle_scores = AGREECalculator().calculate_all(params)

    # Detailed, human-readable explanation for each principle.
    return {
        "scores": principle_scores,
        "breakdown": build_breakdown(params, principle_scores),
    }

@app.post("/step3/calculate-nqs")
async def step3_calculate_nqs(
    extracted_data: str = Form(...)
):
    """Compute greenness scores for the NQS metric from extracted_data JSON.

    Reads ``need_tier`` (int, default 1), the twelve values ``r1``–``r4``,
    ``g1``–``g4``, ``b1``–``b4`` (float, default 0.0) and the seventeen flags
    ``sdg_1``–``sdg_17`` (truthiness, default False) and passes them to
    ``NQSCalculator.calculate_nqs``, whose result is returned unchanged.

    Raises:
        HTTPException: 400 when the payload is not a JSON object or one of
            the numeric fields cannot be converted.
    """
    try:
        data = json.loads(extracted_data)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON for extracted_data")
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="extracted_data must be a JSON object")

    # Convert inputs; malformed values are a client error, not a server fault.
    try:
        need_tier = int(data.get("need_tier", 1))
        rgb_data = {
            f"{letter}{i}": float(data.get(f"{letter}{i}", 0.0))
            for letter in ("r", "g", "b")
            for i in range(1, 5)
        }
    except (TypeError, ValueError) as e:
        raise HTTPException(status_code=400, detail=f"Invalid numeric value in extracted_data: {e}") from e

    sdg_agreements = [bool(data.get(f"sdg_{i}", False)) for i in range(1, 18)]

    calculator = NQSCalculator()
    return calculator.calculate_nqs(need_tier, rgb_data, sdg_agreements)


@app.post("/step3/calculate-bagi")
async def step3_calculate_bagi(
    extracted_data: str = Form(...)
):
    """Compute greenness scores for the BAGI metric from extracted_data JSON (selections dict {p1: 0, p2: 1, ...}).

    Each of ``p1``–``p10`` is an option index from 0 to 3 that maps to
    10.0, 7.5, 5.0 or 2.5 points; a missing entry counts as index 3.
    The response is ``{"scores": {...}}`` with one entry per parameter plus
    ``"Total"``.

    Raises:
        HTTPException: 400 when the payload is not a JSON object or an index
            is not an integer between 0 and 3.
    """
    try:
        data = json.loads(extracted_data)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON for extracted_data")
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="extracted_data must be a JSON object")

    scores = {}
    total = 0.0
    for i in range(1, 11):
        key = f"p{i}"
        try:
            idx = int(data.get(key, 3))  # Default to worst (2.5) if not set
        except (TypeError, ValueError) as e:
            raise HTTPException(status_code=400, detail=f"Invalid option index for {key}: {e}") from e
        # Indices outside 0..3 would produce scores outside the 2.5-10 scale.
        if not 0 <= idx <= 3:
            raise HTTPException(status_code=400, detail=f"Option index for {key} must be between 0 and 3")
        score = 10.0 - (idx * 2.5)
        scores[key] = score
        total += score

    scores["Total"] = total
    return {"scores": scores}


@app.post("/step3/calculate-rapi")
async def step3_calculate_rapi(
    extracted_data: str = Form(...)
):
    """Compute performance scores for RAPI from extracted_data JSON (selections dict {p1: 0, p2: 1, ...}).

    Each of ``p1``–``p10`` is taken as the point value itself (missing
    entries count as 0.0). The response is ``{"scores": {...}}`` with one
    entry per parameter plus their sum under ``"Total"``.

    Raises:
        HTTPException: 400 when the payload is not a JSON object or a value
            cannot be converted to a number.
    """
    try:
        data = json.loads(extracted_data)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON for extracted_data")
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="extracted_data must be a JSON object")

    scores = {}
    for i in range(1, 11):
        key = f"p{i}"
        # The frontend will send the exact point value (0, 2.5, 5, 7.5, 10) for RAPI
        try:
            scores[key] = float(data.get(key, 0.0))
        except (TypeError, ValueError) as e:
            raise HTTPException(status_code=400, detail=f"Invalid score for {key}: {e}") from e

    # Sum in p1..p10 order, starting from 0.0, before adding the "Total" key.
    total = 0.0
    for value in scores.values():
        total += value
    scores["Total"] = total
    return {"scores": scores}


@app.post("/step3/calculate-complexmogapi")
async def step3_calculate_complexmogapi(
    extracted_data: str = Form(...)
):
    """Compute TGS for ComplexMoGAPI from extracted_data JSON (selections dict {param_id: option_index}).

    Every entry whose value is an integer (or a string holding one) is
    forwarded as a parameter selection; ``has_quantification`` (default True)
    and ``e_factor`` (default 0.0) are passed separately and ``_tgs_preview``
    is ignored. The result of ``ComplexMoGAPICalculator.calculate`` is
    returned unchanged.

    Raises:
        HTTPException: 400 when the payload is not a JSON object or
            ``e_factor`` is not numeric.
    """
    try:
        data = json.loads(extracted_data)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON for extracted_data")
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="extracted_data must be a JSON object")

    # Ensure all values are ints for parameter selections only.
    # The full-match test (instead of lstrip('-').isdigit()) guarantees int()
    # cannot fail: inputs like "--5" or "²" are skipped rather than crashing.
    reserved = ('has_quantification', 'e_factor', '_tgs_preview')
    selections = {}
    for key, value in data.items():
        if key in reserved or not isinstance(value, (int, float, str)):
            continue
        if re.fullmatch(r'-?\d+', str(value)):
            selections[key] = int(value)

    has_quantification = bool(data.get('has_quantification', True))
    try:
        e_factor = float(data.get('e_factor', 0.0))
    except (TypeError, ValueError) as e:
        raise HTTPException(status_code=400, detail=f"Invalid e_factor: {e}") from e

    calculator = ComplexMoGAPICalculator()
    return calculator.calculate(selections, has_quantification=has_quantification, e_factor=e_factor)


@app.post("/step3/calculate-ecoscale")
async def step3_calculate_ecoscale(
    extracted_data: str = Form(...)
):
    """Evaluate the Analytical Eco-Scale for the JSON payload in ``extracted_data``.

    The decoded payload is passed straight to ``EcoScaleCalculator.calculate``
    and its result is returned as-is; undecodable JSON yields HTTP 400.
    """
    try:
        payload = json.loads(extracted_data)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON for extracted_data")

    return EcoScaleCalculator().calculate(payload)


@app.post("/step3/calculate-cafri")
async def step3_calculate_cafri(
    extracted_data: str = Form(...)
):
    """Evaluate the Carbon Footprint Reduction Index for the JSON payload.

    The decoded payload is passed straight to ``CaFRICalculator.calculate``
    and its result is returned as-is; undecodable JSON yields HTTP 400.
    """
    try:
        payload = json.loads(extracted_data)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON for extracted_data")

    return CaFRICalculator().calculate(payload)

@app.get("/complexmogapi/params")
async def get_complexmogapi_params():
    """Expose the ComplexMoGAPI parameter definitions so the frontend can build its review form."""
    definitions = get_param_definitions()
    return {"params": definitions}


# Display metadata (name, goal, icon) for each AGREE principle, keyed by
# principle number; merged into every entry produced by build_breakdown().
PRINCIPLE_INFO = {
    1: {"name": "Sample Treatment", "goal": "Avoid sample treatment", "icon": "🔬"},
    2: {"name": "Sample Amount", "goal": "Minimize sample size", "icon": "⚖️"},
    3: {"name": "Device Positioning", "goal": "Perform in situ measurements", "icon": "📡"},
    4: {"name": "Integration of Steps", "goal": "Reduce distinct procedural steps", "icon": "🔗"},
    5: {"name": "Automation & Miniaturization", "goal": "Select automated & miniaturized methods", "icon": "🤖"},
    6: {"name": "Derivatization", "goal": "Avoid chemical modification", "icon": "⚗️"},
    7: {"name": "Analytical Waste", "goal": "Prevent large volumes of waste", "icon": "♻️"},
    8: {"name": "Throughput", "goal": "Multi-analyte methods are preferred", "icon": "⚡"},
    9: {"name": "Energy Consumption", "goal": "Minimize energy use", "icon": "🔋"},
    10: {"name": "Source of Reagents", "goal": "Use bio-based/renewable reagents", "icon": "🌿"},
    11: {"name": "Toxicity", "goal": "Eliminate toxic reagents", "icon": "☠️"},
    12: {"name": "Operator Safety", "goal": "Increase safety", "icon": "🛡️"},
}


def _coerce_number(value, default=0):
    """Return ``value`` unchanged if numeric, else ``float(value)``, else ``default``."""
    if isinstance(value, (int, float)):
        return value
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def build_breakdown(data: dict, scores: dict) -> list:
    """Describe how each of the 12 AGREE principle scores was obtained.

    Args:
        data: extracted method parameters (``p1_option``, ``p2_amount``, ...).
            Numeric fields that are missing, ``None`` or non-numeric strings
            are treated as 0 instead of raising; numeric strings are parsed.
        scores: mapping from principle number (1–12) to its score; missing
            entries count as 0.

    Returns:
        A list of 12 dicts in principle order, each holding ``principle``,
        the ``PRINCIPLE_INFO`` fields, a textual ``input``, the ``formula``
        that applies to that input, and the ``score``.
    """
    def score_of(n):
        return scores.get(n, 0)

    def fmt_score(n, spec):
        # Format defensively: a non-numeric score is rendered as 0.
        return format(_coerce_number(score_of(n)), spec)

    def entry(n, input_text, formula):
        return {"principle": n, **PRINCIPLE_INFO[n], "input": input_text, "formula": formula, "score": score_of(n)}

    breakdown = []

    # P1
    breakdown.append(entry(1, f"Selected: {data.get('p1_option', 'N/A')}", "Direct lookup table"))

    # P2
    amt2 = _coerce_number(data.get("p2_amount", 0))
    if amt2 < 0.1:
        formula2 = "amount < 0.1 → Score = 1.00"
    elif amt2 > 100:
        formula2 = "amount > 100 → Score = 0.00"
    else:
        formula2 = f"Score = |−0.142 × ln({amt2}) + 0.65| = {fmt_score(2, '.3f')}"
    breakdown.append(entry(2, f"Sample amount: {amt2} g/mL", formula2))

    # P3
    breakdown.append(entry(3, f"Positioning: {data.get('p3_option', 'N/A')}", "Direct lookup table"))

    # P4
    steps4 = data.get("p4_steps", 0)
    breakdown.append(entry(4, f"Steps: {steps4}", f"{steps4} steps → Score = {fmt_score(4, '.1f')}"))

    # P5
    auto5 = data.get("p5_automation", "manual")
    mini5 = data.get("p5_miniaturized", False)
    breakdown.append(entry(5, f"Automation: {auto5}, Miniaturized: {mini5}", "2D matrix lookup"))

    # P6
    deriv6 = data.get("p6_has_derivatization", False)
    cas6 = data.get("p6_agents_cas", [])
    if isinstance(cas6, (list, tuple, set)):
        agents6 = [str(agent) for agent in cas6]
    elif cas6:
        # A single scalar (e.g. one CAS string) is treated as one agent.
        agents6 = [str(cas6)]
    else:
        agents6 = []
    if not deriv6:
        formula6 = "No derivatization → Score = 1.0"
    else:
        formula6 = f"Score = (∏ DA_i) − 0.2  [agents: {', '.join(agents6) if agents6 else 'none listed'}]"
    breakdown.append(entry(6, f"Derivatization: {deriv6}, CAS agents: {cas6}", formula6))

    # P7
    amt7 = _coerce_number(data.get("p7_waste_amount", 0))
    if amt7 < 0.1:
        formula7 = "amount < 0.1 → Score = 1.00"
    elif amt7 > 150:
        formula7 = "amount > 150 → Score = 0.00"
    else:
        formula7 = f"Score = |−0.134 × ln({amt7}) + 0.6946| = {fmt_score(7, '.3f')}"
    breakdown.append(entry(7, f"Waste: {amt7} g/mL", formula7))

    # P8
    a8 = _coerce_number(data.get("p8_analytes_per_run", 0))
    r8 = _coerce_number(data.get("p8_runs_per_hour", 0))
    total8 = a8 * r8
    if total8 > 70:
        formula8 = f"{a8} × {r8} = {total8} > 70 → Score = 1.00"
    elif total8 < 1:
        formula8 = f"{a8} × {r8} = {total8} < 1 → Score = 0.00"
    else:
        formula8 = f"Score = |0.2429 × ln({total8:.2f}) − 0.0517| = {fmt_score(8, '.3f')}"
    breakdown.append(entry(8, f"Analytes/run: {a8}, Runs/hr: {r8}", formula8))

    # P9
    breakdown.append(entry(9, f"Technique: {data.get('p9_technique', 'N/A')}", "Lookup in high/medium/low technique list"))

    # P10
    breakdown.append(entry(10, f"Reagent source: {data.get('p10_biobased_status', 'N/A')}", "No reagents/All bio-based=1.0, Partial=0.5, None=0.0"))

    # P11
    toxic11 = data.get("p11_has_toxic_reagents", False)
    amt11 = _coerce_number(data.get("p11_toxic_amount", 0))
    if not toxic11:
        formula11 = "No toxic reagents → Score = 1.0"
    elif amt11 < 0.1:
        formula11 = "amount < 0.1 g/mL → Score = 0.8"
    elif amt11 > 50:
        formula11 = "amount > 50 → Score = 0.00"
    else:
        formula11 = f"Score = |−0.129 × ln({amt11}) + 0.5012| = {fmt_score(11, '.3f')}"
    breakdown.append(entry(11, f"Toxic reagents: {toxic11}, Amount: {amt11} g/mL", formula11))

    # P12
    threats12 = data.get("p12_threats_count", 0)
    breakdown.append(entry(12, f"GHS threats count: {threats12}", f"{threats12} threats → Score = {fmt_score(12, '.1f')}"))

    return breakdown


# ─────────────────────────────────────────────────────────────
# STEP 4: Generate AI discussion section
# ─────────────────────────────────────────────────────────────
# Long-form names of the 12 principles, keyed by principle number (1-12).
# NOTE(review): not referenced anywhere in the visible code — confirm it is
# still used elsewhere before relying on or removing it.
PRINCIPLE_NAMES_FULL = {
    1:  "Sample Treatment",
    2:  "Sample Amount",
    3:  "Device Positioning",
    4:  "Integration of Steps",
    5:  "Automation and Miniaturization",
    6:  "Derivatization",
    7:  "Analytical Waste",
    8:  "Throughput",
    9:  "Energy Consumption",
    10: "Source of Reagents",
    11: "Toxicity of Reagents",
    12: "Operator's Safety",
}

def _build_discussion_prompt(scores: dict, data: dict, model_name: str, analysis_type: str = "agree") -> str:
    method_info = {
        "agree": ("AGREE (Analytical GREEnness) metric", "Pena-Pereira F, Wojnowski W, Tobiszewski M. AGREE—Analytical GREEnness Metric Approach and Software. Anal Chem. 2020;92(14):10076–10082"),
        "nqs": ("NQS (Need, Quality, and Sustainability) index", "Kiwfo K, et al. A new need, quality, and sustainability (NQS) index. Microchem J. 2023;193:109026"),
        "complexmogapi": ("ComplexMoGAPI", "Mansour FR, et al. A total scoring system and software for complex modified GAPI. Green Anal Chem. 2024;10:100126"),
        "ecoscale": ("Analytical Eco-Scale", "Gałuszka A, et al. Analytical Eco-Scale for assessing the greenness of analytical procedures. TrAC. 2012;37:61-72"),
        "cafri": ("CaFRI (Carbon Footprint Reduction Index)", "Mansour FR, Nowak PM. Introducing the carbon footprint reduction index (CaFRI). BMC Chemistry. 2025;19:10"),
        "bagi": ("BAGI (Blue Applicability Grade Index)", "Astolfi ML, et al. Blue Applicability Grade Index (BAGI) and tool: A new metric for evaluating the operational practicality of analytical methods. Green Chemistry, 2023")
    }
    method_name, reference = method_info.get(analysis_type, method_info["agree"])

    return f"""You are an expert in Green Analytical Chemistry. Write a formal, academic-quality DISCUSSION SECTION suitable for publication in a peer-reviewed analytical chemistry journal. The discussion is about the greenness assessment of an analytical method evaluated using the {method_name}.

Context:
- AI model used for extraction: {model_name}
- OVERALL SCORES & VERDICT:
{json.dumps(scores, indent=2)}

- EXTRACTED METHOD PARAMETERS:
{json.dumps(data, indent=2)}

WRITING REQUIREMENTS:
1. Write EXACTLY 4–5 paragraphs.
2. Paragraph 1: Introduce the {method_name} briefly (cite: {reference}). State the overall score and classification/verdict of the method.
3. Paragraph 2: Discuss the BEST-performing aspects of the procedure — what the method does well from a greenness perspective, referencing specific extracted parameters (e.g., low solvent use, energy efficiency, non-toxic reagents).
4. Paragraph 3: Discuss the WORST-performing aspects (penalties or low scores) — describe what these reveal about the method's environmental/safety weaknesses.
5. Paragraph 4: Discuss the overall greenness profile and general applicability.
6. Paragraph 5 (optional): Give 2–3 concrete, specific, actionable recommendations to improve the method's greenness score according to the {method_name}.

STYLE:
- Third person, passive voice, formal academic English.
- Be highly specific, citing the exact numeric scores and extracted data values provided above.
- Do NOT use markdown, bullet points, or headers inside the text. Write flowing paragraphs only.
- Length: 350–550 words total.

Produce ONLY the discussion text, starting directly with the first paragraph.
"""

@app.post("/step4/generate-discussion")
async def generate_discussion(
    provider: str = Form("gemini"),
    api_key: str = Form(...),
    scores_json: str = Form(...),
    extracted_data_json: str = Form(...),
    analysis_type: str = Form("agree")
):
    """Use the same LLM provider to generate a scientific discussion section.

    ``scores_json`` and ``extracted_data_json`` are decoded and embedded in
    the prompt built by ``_build_discussion_prompt``; the prompt is sent to
    the selected provider and the stripped reply is returned along with the
    model label, the provider and ``scores["Total"]`` (0.0 when absent).

    Raises:
        HTTPException: 400 for undecodable JSON, a ``scores_json`` that is
            not a JSON object, a provider that cannot be initialised or an
            unknown provider; 500 when the provider call fails or returns
            no text.
    """
    try:
        scores = json.loads(scores_json)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid scores JSON")
    # scores.get("Total") is needed for the response; reject other shapes now
    # rather than failing after the LLM call has already been paid for.
    if not isinstance(scores, dict):
        raise HTTPException(status_code=400, detail="Invalid scores JSON")
    try:
        extracted_data = json.loads(extracted_data_json)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid extracted_data JSON")

    try:
        extractor = PDFExtractor(provider=provider, api_key=api_key)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to init provider '{provider}': {str(e)}") from e

    provider_key = provider.lower()

    # Identify the model name for the prompt
    model_map = {
        "gemini":   "Gemini 2.0 Flash (Google)",
        "groq":     "Llama 3.3 70B (Groq)",
        "openai":   "GPT-4o mini (OpenAI)",
        "lmstudio": "Local LLM (LM Studio)",
        "ollama":   "Local LLM (Ollama)",
        "local":    "Local LLM",
    }
    model_name = model_map.get(provider_key, f"Local LLM ({provider})")

    prompt = _build_discussion_prompt(scores, extracted_data, model_name, analysis_type=analysis_type)

    # Hosted chat-completions providers and the model id each one is called with.
    hosted_chat_models = {
        "groq": "llama-3.3-70b-versatile",
        "openai": "gpt-4o-mini",
    }
    local_providers = ("lmstudio", "ollama", "local")

    try:
        if provider_key == "gemini":
            raw_text = extractor.model.generate_content(prompt).text
        elif provider_key in hosted_chat_models or provider_key in local_providers:
            if provider_key in hosted_chat_models:
                chat_model = hosted_chat_models[provider_key]
            else:
                # Local servers use whatever model the extractor was set up with.
                chat_model = extractor.model_name
            response = extractor.client.chat.completions.create(
                model=chat_model,
                messages=[
                    {"role": "system", "content": "You are an expert academic writer in analytical chemistry."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.6,
                max_tokens=1200,
            )
            raw_text = response.choices[0].message.content
        else:
            raise HTTPException(status_code=400, detail=f"Unknown provider: {provider}")

        if raw_text is None:
            # Report a clear error instead of an AttributeError from .strip().
            raise HTTPException(status_code=500, detail="Discussion generation failed: provider returned no text")
        discussion = raw_text.strip()

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Discussion generation failed: {str(e)}") from e

    return {
        "discussion": discussion,
        "model": model_name,
        "provider": provider,
        "total_score": scores.get("Total", 0.0),
    }