# Source: Hugging Face Space ravimohan19/polymer-datasheet-agent
# Space status: Sleeping
# File size: 6,619 bytes
# Revision: 7b161f7

"""
LLM-based datasheet parser using the HuggingFace Inference API (LLaMA 3.1).

Takes raw web content or uploaded text and extracts structured polymer
datasheet properties.
"""

from __future__ import annotations

import json
import logging
import re
from typing import Optional

from huggingface_hub import InferenceClient

import config
from models import DatasheetRecord

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ── System prompt for structured extraction ──────────────────────────────────
# Sent as the "system" message of the chat completion in parse_datasheet().
# It tells the model to answer with a single JSON object whose values are
# strings ("" when a property is missing), using the key list embedded below.
# _extract_json_to_record() drops any returned key that is not a
# DatasheetRecord field.
# NOTE(review): the keys presumably mirror the DatasheetRecord fields declared
# in models.py — confirm they stay in sync.
# NOTE(review): every blank line inside the literal is part of the string sent
# to the model; they look like a formatting artifact — confirm before removing.

SYSTEM_PROMPT = """\

You are an expert polymer materials scientist and data extraction specialist.

Your task is to extract technical datasheet properties from the provided raw text

and return them as a JSON object.



RULES:

1. Extract ONLY information explicitly stated in the source text.

2. If a property is not found, leave the value as an empty string "".

3. Include units in the value where available (e.g., "65 MPa", "1.14 g/cm³").

4. For properties with ranges, format as "min - max unit" (e.g., "220 - 260 °C").

5. If multiple grades/variants exist, pick the one that best matches the query.

6. Return ONLY valid JSON — no markdown, no extra text, no code blocks.



Return a JSON object with exactly these keys:



{

  "material_name": "",

  "trade_name": "",

  "manufacturer": "",

  "polymer_family": "",

  "grade": "",

  "description": "",

  "processing_method": "",

  "features": "",

  "applications": "",

  "tensile_strength_mpa": "",

  "tensile_modulus_mpa": "",

  "elongation_at_break_pct": "",

  "flexural_strength_mpa": "",

  "flexural_modulus_mpa": "",

  "impact_strength_charpy_kj_m2": "",

  "impact_strength_izod_j_m": "",

  "hardness_shore_d": "",

  "hardness_rockwell": "",

  "compressive_strength_mpa": "",

  "melting_temperature_c": "",

  "glass_transition_temperature_c": "",

  "heat_deflection_temperature_c": "",

  "vicat_softening_temperature_c": "",

  "continuous_service_temperature_c": "",

  "thermal_conductivity_w_mk": "",

  "coefficient_of_thermal_expansion_um_mk": "",

  "flammability_rating": "",

  "density_g_cm3": "",

  "melt_flow_index_g_10min": "",

  "water_absorption_pct": "",

  "moisture_absorption_pct": "",

  "specific_gravity": "",

  "transparency": "",

  "color": "",

  "dielectric_strength_kv_mm": "",

  "dielectric_constant": "",

  "volume_resistivity_ohm_cm": "",

  "surface_resistivity_ohm": "",

  "dissipation_factor": "",

  "acid_resistance": "",

  "alkali_resistance": "",

  "solvent_resistance": "",

  "uv_resistance": "",

  "weatherability": "",

  "fda_approved": "",

  "rohs_compliant": "",

  "reach_compliant": "",

  "ul94_rating": ""

}

"""


def parse_datasheet(
    raw_content: str,
    manufacturer: str = "",
    polymer_family: str = "",
    grade: str = "",
    source_url: str = "",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Extract a structured datasheet record from raw text using the hosted LLM.

    The text is sent to the model configured in ``config.HF_MODEL_ID`` through
    the HuggingFace Inference API, and the JSON reply is converted into a
    ``DatasheetRecord`` by ``_extract_json_to_record``.

    Args:
        raw_content: Raw datasheet text (web content or uploaded text).
            ``None``, empty, or whitespace-only input is rejected without
            calling the API.
        manufacturer: Optional hint naming the manufacturer of interest.
        polymer_family: Optional hint naming the polymer family of interest.
        grade: Optional hint naming the grade of interest.
        source_url: Value stored as the record's ``source_url``.

    Returns:
        ``(record, errors)``. ``record`` is ``None`` whenever extraction
        failed; ``errors`` then holds human-readable reasons. This function
        does not raise for API or parsing failures.
    """
    errors: list[str] = []

    # Reject missing input up front; `not raw_content` also covers None so a
    # failed upstream fetch does not crash on `.strip()`.
    if not raw_content or not raw_content.strip():
        errors.append("No raw content to parse.")
        return None, errors

    # Build the user prompt; the hint is only added when the caller supplied
    # at least one of the three search terms.
    context_hint = ""
    if manufacturer or polymer_family or grade:
        context_hint = (
            f"\nThe user is looking for: Manufacturer={manufacturer}, "
            f"Polymer Family={polymer_family}, Grade={grade}.\n"
            "Focus extraction on this specific material.\n"
        )

    user_prompt = (
        f"{context_hint}\n"
        f"Extract the polymer datasheet properties from the following raw text:\n\n"
        f"{raw_content}"
    )

    # Call the HuggingFace Inference API. Only the remote call and the
    # response unpacking live inside the try, so unrelated bugs are not
    # reported as inference failures.
    try:
        client = InferenceClient(
            model=config.HF_MODEL_ID,
            token=config.HF_TOKEN,
        )
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=config.LLM_MAX_NEW_TOKENS,
            temperature=config.LLM_TEMPERATURE,
        )
        raw_response: Optional[str] = response.choices[0].message.content
    except Exception as exc:
        # Broad catch is deliberate: this is the boundary to a remote service
        # and callers expect an error list rather than an exception.
        errors.append(f"LLM inference failed: {exc}")
        logger.error("LLM inference failed: %s", exc)
        return None, errors

    # The message content can be missing or blank; report that explicitly
    # instead of surfacing a TypeError or a confusing JSON parse error.
    if not raw_response or not raw_response.strip():
        errors.append("LLM returned an empty response.")
        logger.error("LLM returned an empty response.")
        return None, errors

    logger.info("LLM response length: %d chars", len(raw_response))

    # Parse JSON from the response; the helper appends to `errors` on failure.
    record = _extract_json_to_record(raw_response, source_url, errors)
    return record, errors


def _decode_first_json_value(text: str):
    """Decode the first JSON value in *text*, starting at the first ``{`` if any."""
    start = text.find("{")
    if start == -1:
        # No object present; decode as-is so callers can tell "not JSON"
        # (JSONDecodeError) apart from "JSON, but not an object".
        return json.loads(text)
    # raw_decode stops at the end of the first complete value, so prose or
    # stray braces after the object do not break parsing.
    value, _end = json.JSONDecoder().raw_decode(text[start:])
    return value


def _stringify_field(value: object) -> str:
    """Convert a decoded JSON value to the string form stored on the record."""
    if value is None:
        # JSON null means "not found"; the record convention for that is "".
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        # Join list items instead of storing a Python repr like "['a', 'b']".
        return ", ".join(
            _stringify_field(item) for item in value if item is not None
        )
    return str(value)


def _extract_json_to_record(
    raw_response: str,
    source_url: str,
    errors: list[str],
) -> Optional[DatasheetRecord]:
    """
    Extract the JSON object from an LLM reply and build a DatasheetRecord.

    Handles replies wrapped in a markdown code fence and replies with extra
    text before or after the JSON object.

    Args:
        raw_response: Text returned by the model.
        source_url: Stored on the record as ``source_url`` (only if
            ``DatasheetRecord`` declares that field).
        errors: List that receives a message for every failure; mutated in
            place.

    Returns:
        The populated record, or ``None`` if the reply could not be decoded,
        was not a JSON object, or was rejected by ``DatasheetRecord``.
    """
    text = (raw_response or "").strip()

    # Remove markdown code block wrappers if present
    fenced = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()

    try:
        data = _decode_first_json_value(text)
    except json.JSONDecodeError as exc:
        errors.append(f"Failed to parse JSON from LLM response: {exc}")
        logger.error(
            "JSON parse error: %s\nRaw response:\n%s",
            exc,
            (raw_response or "")[:500],
        )
        return None

    if not isinstance(data, dict):
        errors.append("LLM response is not a JSON object.")
        return None

    # The caller-supplied source always wins over anything the model returned.
    data["source_url"] = source_url

    # Build DatasheetRecord, ignoring unknown fields
    valid_fields = set(DatasheetRecord.model_fields.keys())
    filtered = {
        key: _stringify_field(value)
        for key, value in data.items()
        if key in valid_fields
    }

    try:
        return DatasheetRecord(**filtered)
    except Exception as exc:
        # Validation errors are reported to the caller rather than raised.
        errors.append(f"Failed to create DatasheetRecord: {exc}")
        return None


def parse_uploaded_text(
    text: str,
    source_label: str = "user_upload",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Run the datasheet parser on text supplied directly by the user.

    The text (e.g., extracted from an uploaded PDF) is forwarded to
    ``parse_datasheet`` with no manufacturer/family/grade hints, and
    ``source_label`` is recorded in place of a source URL.

    Returns the ``(record, errors)`` pair produced by ``parse_datasheet``.
    """
    outcome = parse_datasheet(raw_content=text, source_url=source_label)
    return outcome