# polymer-datasheet-agent / llm_parser.py
# Uploaded by ravimohan19 via huggingface_hub (commit 7b161f7, verified).
"""
LLM-based datasheet parser using HuggingFace Inference API (LLaMA 3.1).
Takes raw web content or uploaded text and extracts structured polymer
datasheet properties.
"""
from __future__ import annotations
import json
import logging
import re
from typing import Optional
from huggingface_hub import InferenceClient
import config
from models import DatasheetRecord
logger = logging.getLogger(__name__)
# ── System prompt for structured extraction ──────────────────────────────────
# Instructs the model to return a flat JSON object whose keys mirror the
# extraction schema below, with "" for any property absent from the source.
# NOTE(review): this key list presumably matches models.DatasheetRecord's
# fields — keep the two in sync when either changes (verify against models.py).
SYSTEM_PROMPT = """\
You are an expert polymer materials scientist and data extraction specialist.
Your task is to extract technical datasheet properties from the provided raw text
and return them as a JSON object.
RULES:
1. Extract ONLY information explicitly stated in the source text.
2. If a property is not found, leave the value as an empty string "".
3. Include units in the value where available (e.g., "65 MPa", "1.14 g/cm³").
4. For properties with ranges, format as "min - max unit" (e.g., "220 - 260 °C").
5. If multiple grades/variants exist, pick the one that best matches the query.
6. Return ONLY valid JSON — no markdown, no extra text, no code blocks.
Return a JSON object with exactly these keys:
{
"material_name": "",
"trade_name": "",
"manufacturer": "",
"polymer_family": "",
"grade": "",
"description": "",
"processing_method": "",
"features": "",
"applications": "",
"tensile_strength_mpa": "",
"tensile_modulus_mpa": "",
"elongation_at_break_pct": "",
"flexural_strength_mpa": "",
"flexural_modulus_mpa": "",
"impact_strength_charpy_kj_m2": "",
"impact_strength_izod_j_m": "",
"hardness_shore_d": "",
"hardness_rockwell": "",
"compressive_strength_mpa": "",
"melting_temperature_c": "",
"glass_transition_temperature_c": "",
"heat_deflection_temperature_c": "",
"vicat_softening_temperature_c": "",
"continuous_service_temperature_c": "",
"thermal_conductivity_w_mk": "",
"coefficient_of_thermal_expansion_um_mk": "",
"flammability_rating": "",
"density_g_cm3": "",
"melt_flow_index_g_10min": "",
"water_absorption_pct": "",
"moisture_absorption_pct": "",
"specific_gravity": "",
"transparency": "",
"color": "",
"dielectric_strength_kv_mm": "",
"dielectric_constant": "",
"volume_resistivity_ohm_cm": "",
"surface_resistivity_ohm": "",
"dissipation_factor": "",
"acid_resistance": "",
"alkali_resistance": "",
"solvent_resistance": "",
"uv_resistance": "",
"weatherability": "",
"fda_approved": "",
"rohs_compliant": "",
"reach_compliant": "",
"ul94_rating": ""
}
"""
def parse_datasheet(
    raw_content: str,
    manufacturer: str = "",
    polymer_family: str = "",
    grade: str = "",
    source_url: str = "",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Extract structured polymer datasheet properties from *raw_content*
    using LLaMA 3.1 via the HuggingFace Inference API.

    Args:
        raw_content: Raw datasheet text (web scrape or PDF extraction).
        manufacturer: Optional hint — manufacturer the user is looking for.
        polymer_family: Optional hint — polymer family of interest.
        grade: Optional hint — specific grade of interest.
        source_url: Stored on the resulting record for provenance.

    Returns:
        (record, errors): record is None on any failure; errors collects
        human-readable failure reasons (empty on success).
    """
    errors: list[str] = []
    if not raw_content.strip():
        errors.append("No raw content to parse.")
        return None, errors

    # Optional hint steering extraction toward the requested material.
    context_hint = ""
    if manufacturer or polymer_family or grade:
        context_hint = (
            f"\nThe user is looking for: Manufacturer={manufacturer}, "
            f"Polymer Family={polymer_family}, Grade={grade}.\n"
            "Focus extraction on this specific material.\n"
        )
    user_prompt = (
        f"{context_hint}\n"
        f"Extract the polymer datasheet properties from the following raw text:\n\n"
        f"{raw_content}"
    )

    # Call the HuggingFace Inference API.
    try:
        client = InferenceClient(
            model=config.HF_MODEL_ID,
            token=config.HF_TOKEN,
        )
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=config.LLM_MAX_NEW_TOKENS,
            temperature=config.LLM_TEMPERATURE,
        )
        # message.content may be None (the API typing allows it); normalise
        # to "" so len() below and .strip() downstream cannot raise.
        raw_response = response.choices[0].message.content or ""
        logger.info("LLM response length: %d chars", len(raw_response))
    except Exception as exc:
        errors.append(f"LLM inference failed: {exc}")
        # logger.exception records the full traceback, not just str(exc).
        logger.exception("LLM inference failed")
        return None, errors

    if not raw_response.strip():
        errors.append("LLM returned an empty response.")
        return None, errors

    # Parse JSON out of the response text into a DatasheetRecord.
    record = _extract_json_to_record(raw_response, source_url, errors)
    return record, errors
def _extract_json_to_record(
raw_response: str,
source_url: str,
errors: list[str],
) -> Optional[DatasheetRecord]:
"""
Extract JSON from the LLM response (handles markdown code blocks)
and convert to a DatasheetRecord.
"""
# Try to find JSON in the response
json_str = raw_response.strip()
# Remove markdown code block wrappers if present
code_block_match = re.search(
r"```(?:json)?\s*\n?(.*?)\n?```", json_str, re.DOTALL
)
if code_block_match:
json_str = code_block_match.group(1).strip()
# Try to find a JSON object
brace_match = re.search(r"\{.*\}", json_str, re.DOTALL)
if brace_match:
json_str = brace_match.group(0)
try:
data = json.loads(json_str)
except json.JSONDecodeError as exc:
errors.append(f"Failed to parse JSON from LLM response: {exc}")
logger.error("JSON parse error: %s\nRaw response:\n%s", exc, raw_response[:500])
return None
if not isinstance(data, dict):
errors.append("LLM response is not a JSON object.")
return None
# Set source URL
data["source_url"] = source_url
# Build DatasheetRecord, ignoring unknown fields
valid_fields = set(DatasheetRecord.model_fields.keys())
filtered = {k: str(v) for k, v in data.items() if k in valid_fields}
try:
record = DatasheetRecord(**filtered)
return record
except Exception as exc:
errors.append(f"Failed to create DatasheetRecord: {exc}")
return None
def parse_uploaded_text(
    text: str,
    source_label: str = "user_upload",
) -> tuple[Optional[DatasheetRecord], list[str]]:
    """
    Run LLM extraction on text supplied directly by the user
    (e.g. the output of a PDF-to-text step).

    Args:
        text: Raw datasheet text to parse.
        source_label: Stored as the record's source URL for provenance.

    Returns:
        (record, errors), exactly as produced by parse_datasheet().
    """
    record, errors = parse_datasheet(raw_content=text, source_url=source_label)
    return record, errors