# Structured_Extraction / src/streamlit_app.py
# (Hugging Face Space file header — uploaded by jarvisx17, commit d91bc32.)
import json
import re
import streamlit as st
import pandas as pd
from typing import Any, List
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
# Load environment variables (e.g. GROQ_API_KEY read later) from a local .env file.
load_dotenv()
# --- 1. Config ---
# Seed row shown in the field editor on first load.
DEFAULT_FIELDS = [{"name": "number", "datatype": "int", "description": "Description of the item"}]
# Datatype labels accepted from the editor; anything else falls back to "str".
TYPE_MAPPING_STR = {"int": "int", "float": "float", "str": "str"}
def normalize_fields(fields: Any) -> List[dict]:
    """Coerce a DataFrame or list of row dicts into sanitized field records.

    Each returned record carries stripped string values for ``name``,
    ``datatype`` (blank falls back to ``"str"``), and ``description``.
    Non-dict rows are dropped; unsupported inputs or unexpected errors
    yield an empty list rather than raising.
    """
    try:
        if isinstance(fields, pd.DataFrame):
            rows = fields.fillna("").to_dict(orient="records")
        elif isinstance(fields, list):
            rows = fields
        else:
            return []
        return [
            {
                "name": str(row.get("name", "")).strip(),
                "datatype": str(row.get("datatype", "str")).strip() or "str",
                "description": str(row.get("description", "")).strip(),
            }
            for row in rows
            if isinstance(row, dict)
        ]
    except Exception:
        return []
def generate_schema_json(fields: Any) -> str:
    """Render the user-defined fields as a pretty-printed JSON-schema string.

    Every named field becomes a nullable, required property. Rows without a
    name are skipped and unrecognized datatypes fall back to ``"str"``.
    """
    properties: dict = {}
    required: list = []
    for row in normalize_fields(fields):
        name = row.get("name", "").strip()
        if not name:
            continue
        properties[name] = {
            "type": TYPE_MAPPING_STR.get(row.get("datatype", "str"), "str"),
            "description": row.get("description", ""),
            "nullable": True,
        }
        required.append(name)
    return json.dumps(
        {
            "type": "object",
            "properties": properties,
            "required": required,
            "additionalProperties": False,
        },
        indent=2,
    )
def is_valid_text(text: str) -> bool:
    """Guardrail: return True only when *text* contains non-whitespace characters."""
    if text is None:
        return False
    return text.strip() != ""
def parse_json_from_text(text: str) -> dict | None:
"""Extract JSON object from model response text."""
try:
# 1) direct JSON
parsed = json.loads(text)
return parsed if isinstance(parsed, dict) else None
except Exception:
pass
try:
# 2) fenced code block
fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.DOTALL | re.IGNORECASE)
if fenced:
parsed = json.loads(fenced.group(1))
return parsed if isinstance(parsed, dict) else None
except Exception:
pass
try:
# 3) first object-looking block
obj = re.search(r"(\{.*\})", text, flags=re.DOTALL)
if obj:
parsed = json.loads(obj.group(1))
return parsed if isinstance(parsed, dict) else None
except Exception:
pass
return None
def cast_to_dtype(value: Any, dtype: str) -> Any:
    """Coerce *value* to the requested dtype ("int", "float", else str).

    Returns None for None input or when conversion fails, so schema
    coercion degrades to null instead of raising.
    """
    if value is None:
        return None
    try:
        if dtype == "int":
            try:
                return int(value)
            except (TypeError, ValueError):
                # LLMs often emit float-formatted numbers ("3.0", "3.7");
                # accept them by truncating via float.
                return int(float(value))
        if dtype == "float":
            return float(value)
        return str(value)
    except Exception:
        # Unconvertible value (e.g. "abc" -> int, inf -> int): report null.
        return None
def extract_structured(fields: Any, unstructured_text: str) -> dict | str:
    """
    Extract structured data from unstructured text based on user-defined fields.

    Args:
        fields: A list of dicts or a pd.DataFrame with columns
            [name, datatype, description].
        unstructured_text: Raw text to extract data from.

    Returns:
        A JSON dict on success, or a human-readable error string on any
        validation/configuration/model failure (the UI shows strings as errors).
    """
    if not is_valid_text(unstructured_text):
        return "Input text is empty. Please provide some text to extract from."
    # Build the prompt schema from user-defined fields, preserving row order.
    normalized_fields = normalize_fields(fields)
    schema_properties = {}
    field_order = []
    for f in normalized_fields:
        field_name = f.get("name", "").strip()
        if not field_name:
            continue
        if not field_name.isidentifier():
            return f"Invalid field name '{field_name}'. Use letters, numbers, and underscores only."
        field_type = TYPE_MAPPING_STR.get(f.get("datatype", "str"), "str")
        schema_properties[field_name] = {
            "type": field_type,
            "description": f.get("description", ""),
        }
        field_order.append(field_name)
    if not schema_properties:
        return "Please add at least one valid field before extraction."
    # Fail fast with a friendly message instead of letting ChatGroq raise
    # an unhandled exception when the key is missing.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "GROQ_API_KEY is not set. Please configure it before extraction."
    try:
        # Client construction inside the try: init errors (bad key, bad model
        # name) become error strings like every other failure here.
        llm = ChatGroq(
            model="openai/gpt-oss-120b",
            temperature=0,
            api_key=api_key,
        )
        schema_json = json.dumps(schema_properties, indent=2)
        response = llm.invoke(
            "Extract information from the text below.\n"
            "Return ONLY one valid JSON object and no extra text.\n"
            "Use exactly the fields in this schema.\n"
            "If a value is missing, return null.\n\n"
            f"Schema:\n{schema_json}\n\n"
            f"Text:\n{unstructured_text}"
        )
        content = response.content if hasattr(response, "content") else str(response)
        # Some providers return content as a list of parts; flatten to text.
        if isinstance(content, list):
            content = "".join(
                part.get("text", "") if isinstance(part, dict) else str(part)
            for part in content
            )
        parsed = parse_json_from_text(str(content))
        # Only None means "no JSON found"; an empty dict {} is a valid
        # response and coerces below to all-null fields.
        if parsed is None:
            return f"Could not parse JSON from model output: {content}"
        # Coerce output to requested schema and order
        cleaned = {}
        for field_name in field_order:
            dtype = schema_properties[field_name]["type"]
            cleaned[field_name] = cast_to_dtype(parsed.get(field_name), dtype)
        return cleaned
    except Exception as e:
        return f"Error during extraction: {str(e)}"
def render_styles():
    """Inject the custom CSS used by the page title, subtitle, and section headers."""
    css = """
    <style>
    .main-title {
        font-size: 34px;
        font-weight: 700;
        margin-bottom: 4px;
    }
    .sub-title {
        color: #6b7280;
        margin-bottom: 20px;
    }
    .block-header {
        font-size: 22px;
        font-weight: 600;
        margin: 8px 0 8px 0;
    }
    </style>
    """
    st.markdown(css, unsafe_allow_html=True)
def main():
    """Streamlit entry point: field editor + input text on the left, extraction result on the right."""
    st.set_page_config(page_title="Dynamic Extraction", layout="wide")
    render_styles()
    st.markdown('<div class="main-title">Dynamic Invoice Extraction</div>', unsafe_allow_html=True)
    st.markdown('<div class="sub-title">Json structured output</div>', unsafe_allow_html=True)
    # Session state: the editable field table plus the last extraction result
    # (dict on success, string error message on failure).
    if "fields_df" not in st.session_state:
        st.session_state.fields_df = pd.DataFrame(DEFAULT_FIELDS)
    if "generated_schema" not in st.session_state:
        st.session_state.generated_schema = ""
    if "structured_result" not in st.session_state:
        st.session_state.structured_result = ""
    if "structured_result_json" not in st.session_state:
        st.session_state.structured_result_json = {}
    left_col, right_col = st.columns(2)
    with left_col:
        st.markdown('<div class="block-header">Define Entities / Fields</div>', unsafe_allow_html=True)
        # Append a blank row; the data_editor below also allows dynamic rows.
        if st.button("+ Add Field", width="stretch"):
            st.session_state.fields_df = pd.concat(
                [st.session_state.fields_df, pd.DataFrame([{"name": "", "datatype": "str", "description": ""}])],
                ignore_index=True,
            )
        edited_df = st.data_editor(
            st.session_state.fields_df,
            width="stretch",
            num_rows="dynamic",
            column_config={
                "name": st.column_config.TextColumn("name"),
                "datatype": st.column_config.SelectboxColumn("datatype", options=["str", "int", "float"]),
                "description": st.column_config.TextColumn("description"),
            },
            key="fields_editor",
        )
        st.session_state.fields_df = edited_df
        st.markdown('<div class="block-header">Paste Unstructured Text</div>', unsafe_allow_html=True)
        unstructured_text = st.text_area(
            "Example: https://huggingface.co/spaces/opendatalab/MinerU",
            # Fixed user-facing typo: "mqarkdown" -> "markdown".
            "Click on the above link and extract the markdown text from that page and paste it here...",
            placeholder="Paste your text here...",
            height=220,
        )
        if st.button("Extract Structured Data", type="primary", width="stretch"):
            with st.spinner("Extracting structured data..."):
                result = extract_structured(st.session_state.fields_df, unstructured_text)
            # extract_structured returns a dict on success, an error string otherwise.
            if isinstance(result, dict):
                st.session_state.structured_result_json = result
                st.session_state.structured_result = ""
            else:
                st.session_state.structured_result_json = {}
                st.session_state.structured_result = result
    with right_col:
        st.markdown("### Structured Output (Transposed Table)")
        if st.session_state.structured_result_json:
            # Transpose the single-row result so fields render as rows.
            transposed_df = (
                pd.DataFrame([st.session_state.structured_result_json])
                .T.reset_index()
                .rename(columns={"index": "Field", 0: "Value"})
            )
            st.dataframe(transposed_df, width="stretch", hide_index=True)
        elif st.session_state.structured_result:
            st.error(st.session_state.structured_result)
        else:
            st.info("Run extraction to see transposed table output.")
# Run the app when this file is executed as a script (Streamlit runs it this way).
if __name__ == "__main__":
    main()