"""
LLM-based datasheet parser using HuggingFace Inference API (LLaMA 3.1).

Takes raw web content or uploaded text and extracts structured polymer
datasheet properties.
"""

from __future__ import annotations

import json
import logging
import re
from typing import Optional

from huggingface_hub import InferenceClient

import config
from models import DatasheetRecord

logger = logging.getLogger(__name__)

# ── System prompt for structured extraction ──────────────────────────────────
#
# Sent as the "system" message of the chat completion issued by
# parse_datasheet(). It instructs the model to answer with a single flat JSON
# object whose values are strings (empty string when a property is absent).
#
# _extract_json_to_record() keeps only the returned keys that are also
# DatasheetRecord fields; anything else the model emits is silently dropped.
# NOTE(review): the key list below presumably mirrors the fields declared on
# DatasheetRecord in models.py — confirm and keep the two in sync when adding
# or renaming a property.

SYSTEM_PROMPT = """\
You are an expert polymer materials scientist and data extraction specialist.
Your task is to extract technical datasheet properties from the provided raw text
and return them as a JSON object.

RULES:
1. Extract ONLY information explicitly stated in the source text.
2. If a property is not found, leave the value as an empty string "".
3. Include units in the value where available (e.g., "65 MPa", "1.14 g/cm³").
4. For properties with ranges, format as "min - max unit" (e.g., "220 - 260 °C").
5. If multiple grades/variants exist, pick the one that best matches the query.
6. Return ONLY valid JSON — no markdown, no extra text, no code blocks.

Return a JSON object with exactly these keys:

{
  "material_name": "",
  "trade_name": "",
  "manufacturer": "",
  "polymer_family": "",
  "grade": "",
  "description": "",
  "processing_method": "",
  "features": "",
  "applications": "",
  "tensile_strength_mpa": "",
  "tensile_modulus_mpa": "",
  "elongation_at_break_pct": "",
  "flexural_strength_mpa": "",
  "flexural_modulus_mpa": "",
  "impact_strength_charpy_kj_m2": "",
  "impact_strength_izod_j_m": "",
  "hardness_shore_d": "",
  "hardness_rockwell": "",
  "compressive_strength_mpa": "",
  "melting_temperature_c": "",
  "glass_transition_temperature_c": "",
  "heat_deflection_temperature_c": "",
  "vicat_softening_temperature_c": "",
  "continuous_service_temperature_c": "",
  "thermal_conductivity_w_mk": "",
  "coefficient_of_thermal_expansion_um_mk": "",
  "flammability_rating": "",
  "density_g_cm3": "",
  "melt_flow_index_g_10min": "",
  "water_absorption_pct": "",
  "moisture_absorption_pct": "",
  "specific_gravity": "",
  "transparency": "",
  "color": "",
  "dielectric_strength_kv_mm": "",
  "dielectric_constant": "",
  "volume_resistivity_ohm_cm": "",
  "surface_resistivity_ohm": "",
  "dissipation_factor": "",
  "acid_resistance": "",
  "alkali_resistance": "",
  "solvent_resistance": "",
  "uv_resistance": "",
  "weatherability": "",
  "fda_approved": "",
  "rohs_compliant": "",
  "reach_compliant": "",
  "ul94_rating": ""
}
"""


def parse_datasheet(
    raw_content: str,
    manufacturer: str = "",
    polymer_family: str = "",
    grade: str = "",
    source_url: str = "",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Extract structured datasheet properties from raw text via the LLM.

    The text is sent to the model configured in ``config.HF_MODEL_ID``
    through the HuggingFace Inference API, and the JSON answer is converted
    into a DatasheetRecord.

    Args:
        raw_content: Raw web content or uploaded text to extract from.
            ``None`` or whitespace-only input is rejected without calling
            the API.
        manufacturer: Optional hint naming the manufacturer of interest.
        polymer_family: Optional hint naming the polymer family of interest.
        grade: Optional hint naming the grade of interest.
        source_url: Origin of the content; stored on the resulting record.

    Returns:
        ``(record, errors)``. ``record`` is None whenever extraction failed,
        in which case ``errors`` holds at least one human-readable message.
        This function does not raise for API or parsing failures.
    """
    errors: list[str] = []

    # Treat None like empty text so callers feeding in the result of a failed
    # text extraction get an error message instead of an AttributeError.
    if not raw_content or not raw_content.strip():
        errors.append("No raw content to parse.")
        return None, errors

    # Build the user prompt; the hint is only added when the caller supplied
    # at least one identifying field.
    context_hint = ""
    if manufacturer or polymer_family or grade:
        context_hint = (
            f"\nThe user is looking for: Manufacturer={manufacturer}, "
            f"Polymer Family={polymer_family}, Grade={grade}.\n"
            "Focus extraction on this specific material.\n"
        )

    user_prompt = (
        f"{context_hint}\n"
        f"Extract the polymer datasheet properties from the following raw text:\n\n"
        f"{raw_content}"
    )

    # Call HuggingFace Inference API. Catching Exception is deliberate: this
    # is the boundary to a remote service and the contract is to report
    # failures through ``errors`` rather than raise.
    try:
        client = InferenceClient(
            model=config.HF_MODEL_ID,
            token=config.HF_TOKEN,
        )

        response = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=config.LLM_MAX_NEW_TOKENS,
            temperature=config.LLM_TEMPERATURE,
        )

        raw_response = response.choices[0].message.content

    except Exception as exc:
        errors.append(f"LLM inference failed: {exc}")
        # logger.exception keeps the traceback, which the plain message lacks.
        logger.exception("LLM inference failed: %s", exc)
        return None, errors

    # A completion can succeed yet carry no text; report that explicitly
    # instead of letting it surface as a confusing len()/JSON error.
    if not raw_response or not raw_response.strip():
        errors.append("LLM returned an empty response.")
        logger.error("LLM returned an empty response.")
        return None, errors

    logger.info("LLM response length: %d chars", len(raw_response))

    # Parse JSON from response
    record = _extract_json_to_record(raw_response, source_url, errors)
    return record, errors


def _extract_json_to_record(
    raw_response: str,
    source_url: str,
    errors: list[str],
) -> Optional[DatasheetRecord]:
    """
    Extract JSON from the LLM response (handles markdown code blocks)
    and convert to a DatasheetRecord.
    """
    # Try to find JSON in the response
    json_str = raw_response.strip()

    # Remove markdown code block wrappers if present
    code_block_match = re.search(
        r"```(?:json)?\s*\n?(.*?)\n?```", json_str, re.DOTALL
    )
    if code_block_match:
        json_str = code_block_match.group(1).strip()

    # Try to find a JSON object
    brace_match = re.search(r"\{.*\}", json_str, re.DOTALL)
    if brace_match:
        json_str = brace_match.group(0)

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as exc:
        errors.append(f"Failed to parse JSON from LLM response: {exc}")
        logger.error("JSON parse error: %s\nRaw response:\n%s", exc, raw_response[:500])
        return None

    if not isinstance(data, dict):
        errors.append("LLM response is not a JSON object.")
        return None

    # Set source URL
    data["source_url"] = source_url

    # Build DatasheetRecord, ignoring unknown fields
    valid_fields = set(DatasheetRecord.model_fields.keys())
    filtered = {k: str(v) for k, v in data.items() if k in valid_fields}

    try:
        record = DatasheetRecord(**filtered)
        return record
    except Exception as exc:
        errors.append(f"Failed to create DatasheetRecord: {exc}")
        return None


def parse_uploaded_text(
    text: str,
    source_label: str = "user_upload",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Run datasheet extraction on text supplied directly by the user
    (e.g., the output of a PDF text extractor).

    Delegates to parse_datasheet() without any manufacturer, polymer-family
    or grade hint; ``source_label`` is passed through as the source URL.

    Returns the same (record, errors) pair as parse_datasheet().
    """
    outcome = parse_datasheet(raw_content=text, source_url=source_label)
    return outcome