""" LLM-based datasheet parser using HuggingFace Inference API (LLaMA 3.1). Takes raw web content or uploaded text and extracts structured polymer datasheet properties. """ from __future__ import annotations import json import logging import re from typing import Optional from huggingface_hub import InferenceClient import config from models import DatasheetRecord logger = logging.getLogger(__name__) # ── System prompt for structured extraction ────────────────────────────────── SYSTEM_PROMPT = """\ You are an expert polymer materials scientist and data extraction specialist. Your task is to extract technical datasheet properties from the provided raw text and return them as a JSON object. RULES: 1. Extract ONLY information explicitly stated in the source text. 2. If a property is not found, leave the value as an empty string "". 3. Include units in the value where available (e.g., "65 MPa", "1.14 g/cm³"). 4. For properties with ranges, format as "min - max unit" (e.g., "220 - 260 °C"). 5. If multiple grades/variants exist, pick the one that best matches the query. 6. Return ONLY valid JSON — no markdown, no extra text, no code blocks. Return a JSON object with exactly these keys: { "material_name": "", "trade_name": "", "manufacturer": "", "polymer_family": "", "grade": "", "description": "", "processing_method": "", "features": "", "applications": "", "tensile_strength_mpa": "", "tensile_modulus_mpa": "", "elongation_at_break_pct": "", "flexural_strength_mpa": "", "flexural_modulus_mpa": "", "impact_strength_charpy_kj_m2": "", "impact_strength_izod_j_m": "", "hardness_shore_d": "", "hardness_rockwell": "", "compressive_strength_mpa": "", "melting_temperature_c": "", "glass_transition_temperature_c": "", "heat_deflection_temperature_c": "", "vicat_softening_temperature_c": "", "continuous_service_temperature_c": "", "thermal_conductivity_w_mk": "", "coefficient_of_thermal_expansion_um_mk": "", "flammability_rating": "", "density_g_cm3": "", "melt_flow_index_g_10min": "", "water_absorption_pct": "", "moisture_absorption_pct": "", "specific_gravity": "", "transparency": "", "color": "", "dielectric_strength_kv_mm": "", "dielectric_constant": "", "volume_resistivity_ohm_cm": "", "surface_resistivity_ohm": "", "dissipation_factor": "", "acid_resistance": "", "alkali_resistance": "", "solvent_resistance": "", "uv_resistance": "", "weatherability": "", "fda_approved": "", "rohs_compliant": "", "reach_compliant": "", "ul94_rating": "" } """ def parse_datasheet( raw_content: str, manufacturer: str = "", polymer_family: str = "", grade: str = "", source_url: str = "", ) -> tuple[Optional[DatasheetRecord], list[str]]: """ Send raw content to LLaMA 3.1 via HuggingFace Inference API and parse the response into a DatasheetRecord. Returns (record, errors). """ errors: list[str] = [] if not raw_content.strip(): errors.append("No raw content to parse.") return None, errors # Build the user prompt context_hint = "" if manufacturer or polymer_family or grade: context_hint = ( f"\nThe user is looking for: Manufacturer={manufacturer}, " f"Polymer Family={polymer_family}, Grade={grade}.\n" "Focus extraction on this specific material.\n" ) user_prompt = ( f"{context_hint}\n" f"Extract the polymer datasheet properties from the following raw text:\n\n" f"{raw_content}" ) # Call HuggingFace Inference API try: client = InferenceClient( model=config.HF_MODEL_ID, token=config.HF_TOKEN, ) response = client.chat_completion( messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}, ], max_tokens=config.LLM_MAX_NEW_TOKENS, temperature=config.LLM_TEMPERATURE, ) raw_response = response.choices[0].message.content logger.info("LLM response length: %d chars", len(raw_response)) except Exception as exc: errors.append(f"LLM inference failed: {exc}") logger.error("LLM inference failed: %s", exc) return None, errors # Parse JSON from response record = _extract_json_to_record(raw_response, source_url, errors) return record, errors def _extract_json_to_record( raw_response: str, source_url: str, errors: list[str], ) -> Optional[DatasheetRecord]: """ Extract JSON from the LLM response (handles markdown code blocks) and convert to a DatasheetRecord. """ # Try to find JSON in the response json_str = raw_response.strip() # Remove markdown code block wrappers if present code_block_match = re.search( r"```(?:json)?\s*\n?(.*?)\n?```", json_str, re.DOTALL ) if code_block_match: json_str = code_block_match.group(1).strip() # Try to find a JSON object brace_match = re.search(r"\{.*\}", json_str, re.DOTALL) if brace_match: json_str = brace_match.group(0) try: data = json.loads(json_str) except json.JSONDecodeError as exc: errors.append(f"Failed to parse JSON from LLM response: {exc}") logger.error("JSON parse error: %s\nRaw response:\n%s", exc, raw_response[:500]) return None if not isinstance(data, dict): errors.append("LLM response is not a JSON object.") return None # Set source URL data["source_url"] = source_url # Build DatasheetRecord, ignoring unknown fields valid_fields = set(DatasheetRecord.model_fields.keys()) filtered = {k: str(v) for k, v in data.items() if k in valid_fields} try: record = DatasheetRecord(**filtered) return record except Exception as exc: errors.append(f"Failed to create DatasheetRecord: {exc}") return None def parse_uploaded_text( text: str, source_label: str = "user_upload", ) -> tuple[Optional[DatasheetRecord], list[str]]: """ Parse a user-uploaded datasheet text (e.g., from PDF extraction). """ return parse_datasheet( raw_content=text, source_url=source_label, )