Spaces:
Sleeping
Sleeping
File size: 6,619 Bytes
"""
LLM-based datasheet parser using HuggingFace Inference API (LLaMA 3.1).
Takes raw web content or uploaded text and extracts structured polymer
datasheet properties.
"""
from __future__ import annotations
import json
import logging
import re
from typing import Optional
from huggingface_hub import InferenceClient
import config
from models import DatasheetRecord
logger = logging.getLogger(__name__)
# -- System prompt for structured extraction ----------------------------------
# Sent verbatim as the "system" message of every extraction request. It states
# the extraction rules and lists the exact JSON keys the model is asked to
# return. The unit examples use real Unicode characters (superscript three,
# degree sign) so the model sees well-formed units rather than mis-decoded text.
SYSTEM_PROMPT = """\
You are an expert polymer materials scientist and data extraction specialist.
Your task is to extract technical datasheet properties from the provided raw text
and return them as a JSON object.
RULES:
1. Extract ONLY information explicitly stated in the source text.
2. If a property is not found, leave the value as an empty string "".
3. Include units in the value where available (e.g., "65 MPa", "1.14 g/cm³").
4. For properties with ranges, format as "min - max unit" (e.g., "220 - 260 °C").
5. If multiple grades/variants exist, pick the one that best matches the query.
6. Return ONLY valid JSON — no markdown, no extra text, no code blocks.
Return a JSON object with exactly these keys:
{
"material_name": "",
"trade_name": "",
"manufacturer": "",
"polymer_family": "",
"grade": "",
"description": "",
"processing_method": "",
"features": "",
"applications": "",
"tensile_strength_mpa": "",
"tensile_modulus_mpa": "",
"elongation_at_break_pct": "",
"flexural_strength_mpa": "",
"flexural_modulus_mpa": "",
"impact_strength_charpy_kj_m2": "",
"impact_strength_izod_j_m": "",
"hardness_shore_d": "",
"hardness_rockwell": "",
"compressive_strength_mpa": "",
"melting_temperature_c": "",
"glass_transition_temperature_c": "",
"heat_deflection_temperature_c": "",
"vicat_softening_temperature_c": "",
"continuous_service_temperature_c": "",
"thermal_conductivity_w_mk": "",
"coefficient_of_thermal_expansion_um_mk": "",
"flammability_rating": "",
"density_g_cm3": "",
"melt_flow_index_g_10min": "",
"water_absorption_pct": "",
"moisture_absorption_pct": "",
"specific_gravity": "",
"transparency": "",
"color": "",
"dielectric_strength_kv_mm": "",
"dielectric_constant": "",
"volume_resistivity_ohm_cm": "",
"surface_resistivity_ohm": "",
"dissipation_factor": "",
"acid_resistance": "",
"alkali_resistance": "",
"solvent_resistance": "",
"uv_resistance": "",
"weatherability": "",
"fda_approved": "",
"rohs_compliant": "",
"reach_compliant": "",
"ul94_rating": ""
}
"""
def parse_datasheet(
    raw_content: str,
    manufacturer: str = "",
    polymer_family: str = "",
    grade: str = "",
    source_url: str = "",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Send raw content to the configured chat model and parse the reply.

    The request goes through ``InferenceClient.chat_completion`` using the
    model and token from ``config``; the reply text is handed to
    ``_extract_json_to_record``.

    Args:
        raw_content: Text to extract properties from. ``None``, non-string,
            empty, or whitespace-only input is rejected without calling the
            model.
        manufacturer: Optional hint included in the user prompt.
        polymer_family: Optional hint included in the user prompt.
        grade: Optional hint included in the user prompt.
        source_url: Value forwarded to the record builder as the source.

    Returns:
        ``(record, errors)``. ``record`` is ``None`` whenever extraction
        failed; ``errors`` then holds the human-readable reasons.
    """
    errors: list[str] = []

    # Guard against None / non-string input as well as blank text so callers
    # get an error entry instead of an AttributeError.
    if not isinstance(raw_content, str) or not raw_content.strip():
        errors.append("No raw content to parse.")
        return None, errors

    # Build the user prompt; the hint is only added when the caller gave one.
    context_hint = ""
    if manufacturer or polymer_family or grade:
        context_hint = (
            f"\nThe user is looking for: Manufacturer={manufacturer}, "
            f"Polymer Family={polymer_family}, Grade={grade}.\n"
            "Focus extraction on this specific material.\n"
        )
    user_prompt = (
        f"{context_hint}\n"
        "Extract the polymer datasheet properties from the following raw text:\n\n"
        f"{raw_content}"
    )

    # Call the HuggingFace Inference API. Only the remote call lives in the
    # try body so unrelated failures are not reported as inference errors.
    try:
        client = InferenceClient(
            model=config.HF_MODEL_ID,
            token=config.HF_TOKEN,
        )
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=config.LLM_MAX_NEW_TOKENS,
            temperature=config.LLM_TEMPERATURE,
        )
        raw_response = response.choices[0].message.content
    except Exception as exc:  # API boundary: report any client failure
        errors.append(f"LLM inference failed: {exc}")
        logger.exception("LLM inference failed: %s", exc)
        return None, errors

    # A missing or blank reply is reported explicitly rather than surfacing
    # later as a confusing len()/JSON error.
    if not isinstance(raw_response, str) or not raw_response.strip():
        errors.append("LLM returned an empty response.")
        logger.error("LLM returned an empty response.")
        return None, errors

    logger.info("LLM response length: %d chars", len(raw_response))

    # Parse JSON from response
    record = _extract_json_to_record(raw_response, source_url, errors)
    return record, errors
def _extract_json_to_record(
    raw_response: str,
    source_url: str,
    errors: list[str],
) -> Optional[DatasheetRecord]:
    """
    Extract JSON from the LLM response and convert it to a DatasheetRecord.

    A markdown code fence around the JSON is stripped if present. The object
    is then decoded starting at the first ``{`` so that any commentary the
    model adds after the object (even text containing braces) is ignored.

    Args:
        raw_response: Text returned by the model.
        source_url: Stored in the record under ``source_url``.
        errors: List that failure messages are appended to (mutated in place).

    Returns:
        The constructed record, or ``None`` if the response could not be
        decoded, was not a JSON object, or the record could not be built.
    """
    json_str = raw_response.strip()

    # Remove markdown code block wrappers if present
    code_block_match = re.search(
        r"```(?:json)?\s*\n?(.*?)\n?```", json_str, re.DOTALL
    )
    if code_block_match:
        json_str = code_block_match.group(1).strip()

    try:
        start = json_str.find("{")
        if start == -1:
            # No object at all: let json report the problem (or return a
            # non-dict value that is rejected below).
            data = json.loads(json_str)
        else:
            # raw_decode stops at the end of the first complete value, so
            # trailing prose after the object does not break parsing.
            data, _end = json.JSONDecoder().raw_decode(json_str, start)
    except json.JSONDecodeError as exc:
        errors.append(f"Failed to parse JSON from LLM response: {exc}")
        logger.error("JSON parse error: %s\nRaw response:\n%s", exc, raw_response[:500])
        return None

    if not isinstance(data, dict):
        errors.append("LLM response is not a JSON object.")
        return None

    # Set source URL
    data["source_url"] = source_url

    # Build DatasheetRecord, ignoring unknown fields
    valid_fields = set(DatasheetRecord.model_fields.keys())
    filtered = {
        key: _field_to_text(value)
        for key, value in data.items()
        if key in valid_fields
    }
    try:
        return DatasheetRecord(**filtered)
    except Exception as exc:  # model construction/validation failure
        errors.append(f"Failed to create DatasheetRecord: {exc}")
        logger.error("Failed to create DatasheetRecord: %s", exc)
        return None


def _field_to_text(value: object) -> str:
    """Render one decoded JSON value as the text stored in a record field."""
    # JSON null means "not found"; the prompt's convention for that is "".
    if value is None:
        return ""
    # Models sometimes return arrays (e.g. for features); join them readably
    # instead of storing a Python list repr.
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value if item is not None)
    return str(value)
def parse_uploaded_text(
    text: str,
    source_label: str = "user_upload",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Run datasheet extraction on text supplied directly by the user.

    Delegates to ``parse_datasheet`` with no manufacturer/family/grade hints;
    ``source_label`` is passed along in place of a source URL.

    Returns the ``(record, errors)`` pair produced by ``parse_datasheet``.
    """
    outcome = parse_datasheet(source_url=source_label, raw_content=text)
    return outcome
|