# NOTE: "Spaces: Sleeping" banner removed — it was a HuggingFace Spaces
# status indicator captured during page scraping, not part of this module.
"""
LLM-based datasheet parser using HuggingFace Inference API (LLaMA 3.1).
Takes raw web content or uploaded text and extracts structured polymer
datasheet properties.
"""
from __future__ import annotations

import json
import logging
import re
from typing import Optional

from huggingface_hub import InferenceClient

import config
from models import DatasheetRecord

# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)
# ── System prompt for structured extraction ──────────────────────────────────
# Sent as the "system" message on every LLM call. The embedded JSON skeleton
# enumerates the exact keys the model must return; _extract_json_to_record
# later filters the reply against DatasheetRecord's fields.
SYSTEM_PROMPT = """\
You are an expert polymer materials scientist and data extraction specialist.
Your task is to extract technical datasheet properties from the provided raw text
and return them as a JSON object.
RULES:
1. Extract ONLY information explicitly stated in the source text.
2. If a property is not found, leave the value as an empty string "".
3. Include units in the value where available (e.g., "65 MPa", "1.14 g/cm³").
4. For properties with ranges, format as "min - max unit" (e.g., "220 - 260 °C").
5. If multiple grades/variants exist, pick the one that best matches the query.
6. Return ONLY valid JSON — no markdown, no extra text, no code blocks.
Return a JSON object with exactly these keys:
{
"material_name": "",
"trade_name": "",
"manufacturer": "",
"polymer_family": "",
"grade": "",
"description": "",
"processing_method": "",
"features": "",
"applications": "",
"tensile_strength_mpa": "",
"tensile_modulus_mpa": "",
"elongation_at_break_pct": "",
"flexural_strength_mpa": "",
"flexural_modulus_mpa": "",
"impact_strength_charpy_kj_m2": "",
"impact_strength_izod_j_m": "",
"hardness_shore_d": "",
"hardness_rockwell": "",
"compressive_strength_mpa": "",
"melting_temperature_c": "",
"glass_transition_temperature_c": "",
"heat_deflection_temperature_c": "",
"vicat_softening_temperature_c": "",
"continuous_service_temperature_c": "",
"thermal_conductivity_w_mk": "",
"coefficient_of_thermal_expansion_um_mk": "",
"flammability_rating": "",
"density_g_cm3": "",
"melt_flow_index_g_10min": "",
"water_absorption_pct": "",
"moisture_absorption_pct": "",
"specific_gravity": "",
"transparency": "",
"color": "",
"dielectric_strength_kv_mm": "",
"dielectric_constant": "",
"volume_resistivity_ohm_cm": "",
"surface_resistivity_ohm": "",
"dissipation_factor": "",
"acid_resistance": "",
"alkali_resistance": "",
"solvent_resistance": "",
"uv_resistance": "",
"weatherability": "",
"fda_approved": "",
"rohs_compliant": "",
"reach_compliant": "",
"ul94_rating": ""
}
"""
def parse_datasheet(
    raw_content: str,
    manufacturer: str = "",
    polymer_family: str = "",
    grade: str = "",
    source_url: str = "",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Send raw content to LLaMA 3.1 via HuggingFace Inference API and
    parse the response into a DatasheetRecord.

    Args:
        raw_content: Raw datasheet text (web scrape or upload) to parse.
        manufacturer: Optional hint naming the manufacturer of interest.
        polymer_family: Optional hint naming the polymer family of interest.
        grade: Optional hint naming the specific grade of interest.
        source_url: Recorded on the resulting record for provenance.

    Returns:
        (record, errors) — record is None on any failure; errors collects
        human-readable descriptions of what went wrong.
    """
    errors: list[str] = []
    if not raw_content.strip():
        errors.append("No raw content to parse.")
        return None, errors

    # Build the user prompt. A hint block is prepended only when the caller
    # supplied at least one identifying field, so the model focuses on the
    # right material when the text covers several grades.
    context_hint = ""
    if manufacturer or polymer_family or grade:
        context_hint = (
            f"\nThe user is looking for: Manufacturer={manufacturer}, "
            f"Polymer Family={polymer_family}, Grade={grade}.\n"
            "Focus extraction on this specific material.\n"
        )
    user_prompt = (
        f"{context_hint}\n"
        f"Extract the polymer datasheet properties from the following raw text:\n\n"
        f"{raw_content}"
    )

    # Call HuggingFace Inference API
    try:
        client = InferenceClient(
            model=config.HF_MODEL_ID,
            token=config.HF_TOKEN,
        )
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=config.LLM_MAX_NEW_TOKENS,
            temperature=config.LLM_TEMPERATURE,
        )
        raw_response = response.choices[0].message.content
        # The API may return a message with no content (content is Optional);
        # report that explicitly instead of crashing on len(None), which would
        # otherwise surface as a misleading "LLM inference failed: ... NoneType".
        if raw_response is None:
            errors.append("LLM returned an empty response.")
            return None, errors
        logger.info("LLM response length: %d chars", len(raw_response))
    except Exception as exc:
        errors.append(f"LLM inference failed: {exc}")
        logger.error("LLM inference failed: %s", exc)
        return None, errors

    # Parse JSON from response
    record = _extract_json_to_record(raw_response, source_url, errors)
    return record, errors
| def _extract_json_to_record( | |
| raw_response: str, | |
| source_url: str, | |
| errors: list[str], | |
| ) -> Optional[DatasheetRecord]: | |
| """ | |
| Extract JSON from the LLM response (handles markdown code blocks) | |
| and convert to a DatasheetRecord. | |
| """ | |
| # Try to find JSON in the response | |
| json_str = raw_response.strip() | |
| # Remove markdown code block wrappers if present | |
| code_block_match = re.search( | |
| r"```(?:json)?\s*\n?(.*?)\n?```", json_str, re.DOTALL | |
| ) | |
| if code_block_match: | |
| json_str = code_block_match.group(1).strip() | |
| # Try to find a JSON object | |
| brace_match = re.search(r"\{.*\}", json_str, re.DOTALL) | |
| if brace_match: | |
| json_str = brace_match.group(0) | |
| try: | |
| data = json.loads(json_str) | |
| except json.JSONDecodeError as exc: | |
| errors.append(f"Failed to parse JSON from LLM response: {exc}") | |
| logger.error("JSON parse error: %s\nRaw response:\n%s", exc, raw_response[:500]) | |
| return None | |
| if not isinstance(data, dict): | |
| errors.append("LLM response is not a JSON object.") | |
| return None | |
| # Set source URL | |
| data["source_url"] = source_url | |
| # Build DatasheetRecord, ignoring unknown fields | |
| valid_fields = set(DatasheetRecord.model_fields.keys()) | |
| filtered = {k: str(v) for k, v in data.items() if k in valid_fields} | |
| try: | |
| record = DatasheetRecord(**filtered) | |
| return record | |
| except Exception as exc: | |
| errors.append(f"Failed to create DatasheetRecord: {exc}") | |
| return None | |
def parse_uploaded_text(
    text: str,
    source_label: str = "user_upload",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Parse a user-uploaded datasheet text (e.g., from PDF extraction).

    Thin convenience wrapper around parse_datasheet that records
    *source_label* as the record's source URL.
    """
    return parse_datasheet(raw_content=text, source_url=source_label)