# polymer-datasheet-agent / llm_parser.py
# Uploaded by ravimohan19 via huggingface_hub (commit 7b161f7, verified).
"""
LLM-based datasheet parser using HuggingFace Inference API (LLaMA 3.1).
Takes raw web content or uploaded text and extracts structured polymer
datasheet properties.
"""
from __future__ import annotations
import json
import logging
import re
from typing import Optional
from huggingface_hub import InferenceClient
import config
from models import DatasheetRecord
logger = logging.getLogger(__name__)
# ── System prompt for structured extraction ──────────────────────────────────
# Instructs the model to return a flat JSON object whose keys mirror the
# extraction schema below, with "" for any property absent from the source.
# NOTE(review): this key list presumably matches models.DatasheetRecord's
# fields — keep the two in sync when either changes (verify against models.py).
SYSTEM_PROMPT = """\
You are an expert polymer materials scientist and data extraction specialist.
Your task is to extract technical datasheet properties from the provided raw text
and return them as a JSON object.
RULES:
1. Extract ONLY information explicitly stated in the source text.
2. If a property is not found, leave the value as an empty string "".
3. Include units in the value where available (e.g., "65 MPa", "1.14 g/cm³").
4. For properties with ranges, format as "min - max unit" (e.g., "220 - 260 °C").
5. If multiple grades/variants exist, pick the one that best matches the query.
6. Return ONLY valid JSON — no markdown, no extra text, no code blocks.
Return a JSON object with exactly these keys:
{
"material_name": "",
"trade_name": "",
"manufacturer": "",
"polymer_family": "",
"grade": "",
"description": "",
"processing_method": "",
"features": "",
"applications": "",
"tensile_strength_mpa": "",
"tensile_modulus_mpa": "",
"elongation_at_break_pct": "",
"flexural_strength_mpa": "",
"flexural_modulus_mpa": "",
"impact_strength_charpy_kj_m2": "",
"impact_strength_izod_j_m": "",
"hardness_shore_d": "",
"hardness_rockwell": "",
"compressive_strength_mpa": "",
"melting_temperature_c": "",
"glass_transition_temperature_c": "",
"heat_deflection_temperature_c": "",
"vicat_softening_temperature_c": "",
"continuous_service_temperature_c": "",
"thermal_conductivity_w_mk": "",
"coefficient_of_thermal_expansion_um_mk": "",
"flammability_rating": "",
"density_g_cm3": "",
"melt_flow_index_g_10min": "",
"water_absorption_pct": "",
"moisture_absorption_pct": "",
"specific_gravity": "",
"transparency": "",
"color": "",
"dielectric_strength_kv_mm": "",
"dielectric_constant": "",
"volume_resistivity_ohm_cm": "",
"surface_resistivity_ohm": "",
"dissipation_factor": "",
"acid_resistance": "",
"alkali_resistance": "",
"solvent_resistance": "",
"uv_resistance": "",
"weatherability": "",
"fda_approved": "",
"rohs_compliant": "",
"reach_compliant": "",
"ul94_rating": ""
}
"""
def parse_datasheet(
    raw_content: str,
    manufacturer: str = "",
    polymer_family: str = "",
    grade: str = "",
    source_url: str = "",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Extract structured polymer datasheet properties from *raw_content*
    using LLaMA 3.1 via the HuggingFace Inference API.

    Args:
        raw_content: Raw datasheet text (web scrape or PDF extraction).
        manufacturer: Optional hint — manufacturer the user is looking for.
        polymer_family: Optional hint — polymer family of interest.
        grade: Optional hint — specific grade of interest.
        source_url: Stored on the resulting record for provenance.

    Returns:
        (record, errors): record is None on any failure; errors collects
        human-readable failure reasons (empty on success).
    """
    errors: list[str] = []
    if not raw_content.strip():
        errors.append("No raw content to parse.")
        return None, errors

    # Optional hint steering extraction toward the requested material.
    context_hint = ""
    if manufacturer or polymer_family or grade:
        context_hint = (
            f"\nThe user is looking for: Manufacturer={manufacturer}, "
            f"Polymer Family={polymer_family}, Grade={grade}.\n"
            "Focus extraction on this specific material.\n"
        )
    user_prompt = (
        f"{context_hint}\n"
        f"Extract the polymer datasheet properties from the following raw text:\n\n"
        f"{raw_content}"
    )

    # Call the HuggingFace Inference API.
    try:
        client = InferenceClient(
            model=config.HF_MODEL_ID,
            token=config.HF_TOKEN,
        )
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=config.LLM_MAX_NEW_TOKENS,
            temperature=config.LLM_TEMPERATURE,
        )
        # message.content may be None (the API typing allows it); normalise
        # to "" so len() below and .strip() downstream cannot raise.
        raw_response = response.choices[0].message.content or ""
        logger.info("LLM response length: %d chars", len(raw_response))
    except Exception as exc:
        errors.append(f"LLM inference failed: {exc}")
        # logger.exception records the full traceback, not just str(exc).
        logger.exception("LLM inference failed")
        return None, errors

    if not raw_response.strip():
        errors.append("LLM returned an empty response.")
        return None, errors

    # Parse JSON out of the response text into a DatasheetRecord.
    record = _extract_json_to_record(raw_response, source_url, errors)
    return record, errors
def _extract_json_to_record(
raw_response: str,
source_url: str,
errors: list[str],
) -> Optional[DatasheetRecord]:
"""
Extract JSON from the LLM response (handles markdown code blocks)
and convert to a DatasheetRecord.
"""
# Try to find JSON in the response
json_str = raw_response.strip()
# Remove markdown code block wrappers if present
code_block_match = re.search(
r"```(?:json)?\s*\n?(.*?)\n?```", json_str, re.DOTALL
)
if code_block_match:
json_str = code_block_match.group(1).strip()
# Try to find a JSON object
brace_match = re.search(r"\{.*\}", json_str, re.DOTALL)
if brace_match:
json_str = brace_match.group(0)
try:
data = json.loads(json_str)
except json.JSONDecodeError as exc:
errors.append(f"Failed to parse JSON from LLM response: {exc}")
logger.error("JSON parse error: %s\nRaw response:\n%s", exc, raw_response[:500])
return None
if not isinstance(data, dict):
errors.append("LLM response is not a JSON object.")
return None
# Set source URL
data["source_url"] = source_url
# Build DatasheetRecord, ignoring unknown fields
valid_fields = set(DatasheetRecord.model_fields.keys())
filtered = {k: str(v) for k, v in data.items() if k in valid_fields}
try:
record = DatasheetRecord(**filtered)
return record
except Exception as exc:
errors.append(f"Failed to create DatasheetRecord: {exc}")
return None
def parse_uploaded_text(
    text: str,
    source_label: str = "user_upload",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Run LLM extraction on text supplied directly by the user
    (e.g. the output of a PDF-to-text step).

    Args:
        text: Raw datasheet text to parse.
        source_label: Stored as the record's source URL for provenance.

    Returns:
        (record, errors), exactly as produced by parse_datasheet().
    """
    record, errors = parse_datasheet(raw_content=text, source_url=source_label)
    return record, errors