import json import re import streamlit as st import pandas as pd from typing import Any, List from langchain_groq import ChatGroq import os from dotenv import load_dotenv load_dotenv() # --- 1. Config --- DEFAULT_FIELDS = [{"name": "number", "datatype": "int", "description": "Description of the item"}] TYPE_MAPPING_STR = {"int": "int", "float": "float", "str": "str"} def normalize_fields(fields: Any) -> List[dict]: """Convert DataFrame/list input into a clean list of field dicts.""" try: if isinstance(fields, pd.DataFrame): parsed = fields.fillna("").to_dict(orient="records") elif isinstance(fields, list): parsed = fields else: return [] cleaned = [] for item in parsed: if not isinstance(item, dict): continue cleaned.append( { "name": str(item.get("name", "")).strip(), "datatype": str(item.get("datatype", "str")).strip() or "str", "description": str(item.get("description", "")).strip(), } ) return cleaned except Exception: return [] def generate_schema_json(fields: Any) -> str: """Generate JSON schema-like object from field rows.""" normalized_fields = normalize_fields(fields) properties = {} required = [] for f in normalized_fields: field_name = f.get("name", "").strip() if not field_name: continue dtype = TYPE_MAPPING_STR.get(f.get("datatype", "str"), "str") properties[field_name] = { "type": dtype, "description": f.get("description", ""), "nullable": True, } required.append(field_name) schema = { "type": "object", "properties": properties, "required": required, "additionalProperties": False, } return json.dumps(schema, indent=2) def is_valid_text(text: str) -> bool: """Guardrail: reject empty or whitespace-only input.""" return bool((text or "").strip()) def parse_json_from_text(text: str) -> dict | None: """Extract JSON object from model response text.""" try: # 1) direct JSON parsed = json.loads(text) return parsed if isinstance(parsed, dict) else None except Exception: pass try: # 2) fenced code block fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.DOTALL | re.IGNORECASE) if fenced: parsed = json.loads(fenced.group(1)) return parsed if isinstance(parsed, dict) else None except Exception: pass try: # 3) first object-looking block obj = re.search(r"(\{.*\})", text, flags=re.DOTALL) if obj: parsed = json.loads(obj.group(1)) return parsed if isinstance(parsed, dict) else None except Exception: pass return None def cast_to_dtype(value: Any, dtype: str) -> Any: if value is None: return None try: if dtype == "int": return int(value) if dtype == "float": return float(value) return str(value) except Exception: return None def extract_structured(fields: Any, unstructured_text: str) -> dict | str: """ Extract structured data from unstructured text based on user-defined fields. Args: fields: A list of dicts or a pd.DataFrame with columns [name, datatype, description]. unstructured_text: Raw text to extract data from. Returns: A JSON dict on success, or an error string. """ if not is_valid_text(unstructured_text): return "Input text is empty. Please provide some text to extract from." # Build schema from user-defined fields normalized_fields = normalize_fields(fields) schema_properties = {} field_order = [] for f in normalized_fields: field_name = f.get("name", "").strip() if not field_name: continue if not field_name.isidentifier(): return f"Invalid field name '{field_name}'. Use letters, numbers, and underscores only." field_type = TYPE_MAPPING_STR.get(f.get("datatype", "str"), "str") schema_properties[field_name] = { "type": field_type, "description": f.get("description", ""), } field_order.append(field_name) if not schema_properties: return "Please add at least one valid field before extraction." # Initialize LLM llm = ChatGroq( model="openai/gpt-oss-120b", temperature=0, api_key=os.getenv("GROQ_API_KEY"), ) # Extract structured data try: schema_json = json.dumps(schema_properties, indent=2) response = llm.invoke( "Extract information from the text below.\n" "Return ONLY one valid JSON object and no extra text.\n" "Use exactly the fields in this schema.\n" "If a value is missing, return null.\n\n" f"Schema:\n{schema_json}\n\n" f"Text:\n{unstructured_text}" ) content = response.content if hasattr(response, "content") else str(response) if isinstance(content, list): content = "".join( part.get("text", "") if isinstance(part, dict) else str(part) for part in content ) parsed = parse_json_from_text(str(content)) if not parsed: return f"Could not parse JSON from model output: {content}" # Coerce output to requested schema and order cleaned = {} for field_name in field_order: dtype = schema_properties[field_name]["type"] cleaned[field_name] = cast_to_dtype(parsed.get(field_name), dtype) return cleaned except Exception as e: return f"Error during extraction: {str(e)}" def render_styles(): st.markdown( """ """, unsafe_allow_html=True, ) def main(): st.set_page_config(page_title="Dynamic Extraction", layout="wide") render_styles() st.markdown('