Spaces:

jarvisx17
/

Structured_Extraction

Sleeping

App Files Files Community

jarvisx17 committed on Feb 19

Commit

d91bc32

verified ·

1 Parent(s): 49d1a8b

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +289 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,291 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import json
+import re
 import streamlit as st
+import pandas as pd
+from typing import Any, List
+from langchain_groq import ChatGroq
+import os
+from dotenv import load_dotenv
+load_dotenv()
+# --- 1. Config ---
+# Seed row for the field editor; main() loads it into session state on first run.
+DEFAULT_FIELDS = [{"name": "number", "datatype": "int", "description": "Description of the item"}]
+# Datatype names accepted for a field. Call sites look values up with a "str" fallback,
+# so any datatype not listed here is treated as "str".
+TYPE_MAPPING_STR = {"int": "int", "float": "float", "str": "str"}
+def _field_text(value: Any, default: str = "") -> str:
+    """Return *value* as stripped text, or *default* when it is None/NaN/NA."""
+    # pd.isna() returns an array for list-likes, so only probe scalar values.
+    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
+        return default
+    return str(value).strip()
+def normalize_fields(fields: Any) -> List[dict]:
+    """Convert DataFrame/list input into a clean list of field dicts.
+
+    Args:
+        fields: A ``pd.DataFrame`` or a list of dicts with the keys
+            ``name``, ``datatype`` and ``description``. Any other input
+            type yields an empty list.
+
+    Returns:
+        One dict per input row with exactly the keys ``name``,
+        ``datatype`` and ``description``; all values are stripped strings.
+        Missing values (absent key, ``None``, NaN) become ``""``, except
+        ``datatype`` which falls back to ``"str"``. List items that are
+        not dicts are skipped.
+    """
+    if isinstance(fields, pd.DataFrame):
+        rows = fields.to_dict(orient="records")
+    elif isinstance(fields, list):
+        rows = fields
+    else:
+        return []
+    # Missing cells are handled per value so that list input gets the same
+    # treatment as DataFrame input (a bare str(None) would yield "None").
+    return [
+        {
+            "name": _field_text(row.get("name")),
+            "datatype": _field_text(row.get("datatype"), "str") or "str",
+            "description": _field_text(row.get("description")),
+        }
+        for row in rows
+        if isinstance(row, dict)
+    ]
+def generate_schema_json(fields: Any) -> str:
+    """Serialize the user's field rows as a JSON-schema-like object.
+
+    Rows without a name are ignored. Every remaining field is emitted as a
+    nullable property and is also listed under ``required``.
+    """
+    named_rows = [row for row in normalize_fields(fields) if row.get("name", "").strip()]
+    properties = {
+        row.get("name", "").strip(): {
+            "type": TYPE_MAPPING_STR.get(row.get("datatype", "str"), "str"),
+            "description": row.get("description", ""),
+            "nullable": True,
+        }
+        for row in named_rows
+    }
+    required = [row.get("name", "").strip() for row in named_rows]
+    return json.dumps(
+        {
+            "type": "object",
+            "properties": properties,
+            "required": required,
+            "additionalProperties": False,
+        },
+        indent=2,
+    )
+def is_valid_text(text: str) -> bool:
+    """Guardrail: True only when *text* contains at least one non-whitespace character."""
+    if not text:
+        return False
+    return len(text.strip()) > 0
+def parse_json_from_text(text: str) -> dict | None:
+    """Extract a JSON object from model response text.
+
+    Three strategies are tried in order:
+      1. the whole text is JSON;
+      2. a fenced code block (optionally tagged ``json``) holds an object;
+      3. the first position in the text from which a complete JSON object
+         can be decoded.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The decoded object as a dict, or ``None`` when no JSON object is
+        found (including when the text is valid JSON but not an object).
+    """
+    parse_errors = (TypeError, ValueError, RecursionError)
+    # 1) direct JSON
+    try:
+        parsed = json.loads(text)
+    except parse_errors:
+        pass
+    else:
+        return parsed if isinstance(parsed, dict) else None
+    if not isinstance(text, str):
+        return None
+    # 2) fenced code block
+    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.DOTALL | re.IGNORECASE)
+    if fenced:
+        try:
+            parsed = json.loads(fenced.group(1))
+        except parse_errors:
+            pass
+        else:
+            return parsed if isinstance(parsed, dict) else None
+    # 3) first decodable object. raw_decode stops at the end of the object,
+    #    so trailing prose (even prose containing braces) does not break it.
+    decoder = json.JSONDecoder()
+    start = text.find("{")
+    while start != -1:
+        try:
+            candidate, _ = decoder.raw_decode(text, start)
+        except parse_errors:
+            pass
+        else:
+            if isinstance(candidate, dict):
+                return candidate
+        start = text.find("{", start + 1)
+    return None
+def cast_to_dtype(value: Any, dtype: str) -> Any:
+    """Coerce *value* to the field datatype named by *dtype*.
+
+    Args:
+        value: Value taken from the parsed model output.
+        dtype: ``"int"``, ``"float"``, or anything else (treated as ``"str"``).
+
+    Returns:
+        The converted value, or ``None`` when *value* is ``None`` or cannot
+        be converted.
+    """
+    if value is None:
+        return None
+    try:
+        if dtype == "int":
+            try:
+                return int(value)
+            except ValueError:
+                # Integral numbers written with a decimal point ("42.0")
+                # are accepted; genuinely fractional text ("4.5") is not.
+                as_float = float(value)
+                return int(as_float) if as_float.is_integer() else None
+        if dtype == "float":
+            return float(value)
+        return str(value)
+    except (TypeError, ValueError, OverflowError):
+        return None
+def extract_structured(fields: Any, unstructured_text: str) -> dict | str:
+    """
+    Extract structured data from unstructured text based on user-defined fields.
+    Args:
+        fields:            A list of dicts or a pd.DataFrame with columns
+                           [name, datatype, description].
+        unstructured_text: Raw text to extract data from.
+    Returns:
+        On success, a dict mapping every defined field name (in definition
+        order) to its value cast to the requested datatype; a value is None
+        when the model omitted it or it could not be cast.
+        On failure (empty text, no usable fields, invalid field name,
+        missing API key, model/client error, unparseable reply), a
+        human-readable error string. This function does not raise for
+        those cases.
+    """
+    if not is_valid_text(unstructured_text):
+        return "Input text is empty. Please provide some text to extract from."
+    # Build schema from user-defined fields
+    schema_properties = {}
+    field_order = []
+    for f in normalize_fields(fields):
+        field_name = f.get("name", "").strip()
+        if not field_name:
+            continue
+        if not field_name.isidentifier():
+            return f"Invalid field name '{field_name}'. Use letters, numbers, and underscores only."
+        schema_properties[field_name] = {
+            "type": TYPE_MAPPING_STR.get(f.get("datatype", "str"), "str"),
+            "description": f.get("description", ""),
+        }
+        field_order.append(field_name)
+    if not schema_properties:
+        return "Please add at least one valid field before extraction."
+    api_key = os.getenv("GROQ_API_KEY")
+    if not api_key:
+        return "Error during extraction: GROQ_API_KEY is not set."
+    try:
+        # Client construction lives inside the try so that configuration
+        # errors are reported as an error string like every other failure.
+        llm = ChatGroq(
+            model="openai/gpt-oss-120b",
+            temperature=0,
+            api_key=api_key,
+        )
+        schema_json = json.dumps(schema_properties, indent=2)
+        response = llm.invoke(
+            "Extract information from the text below.\n"
+            "Return ONLY one valid JSON object and no extra text.\n"
+            "Use exactly the fields in this schema.\n"
+            "If a value is missing, return null.\n\n"
+            f"Schema:\n{schema_json}\n\n"
+            f"Text:\n{unstructured_text}"
+        )
+        content = response.content if hasattr(response, "content") else str(response)
+        # Content may arrive as a list of parts; join their text pieces.
+        if isinstance(content, list):
+            content = "".join(
+                part.get("text", "") if isinstance(part, dict) else str(part)
+                for part in content
+            )
+        parsed = parse_json_from_text(str(content))
+        # Only a failed parse is an error; an empty object is a valid reply
+        # meaning "nothing found" and yields None for every field.
+        if parsed is None:
+            return f"Could not parse JSON from model output: {content}"
+        # Coerce output to requested schema and order
+        return {
+            field_name: cast_to_dtype(parsed.get(field_name), schema_properties[field_name]["type"])
+            for field_name in field_order
+        }
+    except Exception as e:
+        # Top-level boundary for the UI: surface any client/model failure as text.
+        return f"Error during extraction: {str(e)}"
+def render_styles():
+    """Inject the page's CSS (title, subtitle and section-header classes) into the app."""
+    stylesheet = """
+<style>
+.main-title {
+  font-size: 34px;
+  font-weight: 700;
+  margin-bottom: 4px;
+}
+.sub-title {
+  color: #6b7280;
+  margin-bottom: 20px;
+}
+.block-header {
+  font-size: 22px;
+  font-weight: 600;
+  margin: 8px 0 8px 0;
+}
+</style>
+        """
+    # Raw HTML is required for the <style> tag to take effect.
+    st.markdown(stylesheet, unsafe_allow_html=True)
+def main():
+    """Render the page: field editor and text input on the left, extraction result on the right.
+
+    State kept in ``st.session_state``:
+        fields_df:              DataFrame of user-defined fields.
+        generated_schema:       initialised to "" (not otherwise used here).
+        structured_result:      error message from the last extraction, or "".
+        structured_result_json: dict result of the last extraction, or {}.
+    """
+    st.set_page_config(page_title="Dynamic Extraction", layout="wide")
+    render_styles()
+    st.markdown('<div class="main-title">Dynamic Invoice Extraction</div>', unsafe_allow_html=True)
+    st.markdown('<div class="sub-title">Json structured output</div>', unsafe_allow_html=True)
+    if "fields_df" not in st.session_state:
+        st.session_state.fields_df = pd.DataFrame(DEFAULT_FIELDS)
+    if "generated_schema" not in st.session_state:
+        st.session_state.generated_schema = ""
+    if "structured_result" not in st.session_state:
+        st.session_state.structured_result = ""
+    if "structured_result_json" not in st.session_state:
+        st.session_state.structured_result_json = {}
+    left_col, right_col = st.columns(2)
+    with left_col:
+        st.markdown('<div class="block-header">Define Entities / Fields</div>', unsafe_allow_html=True)
+        if st.button("+ Add Field", width="stretch"):
+            st.session_state.fields_df = pd.concat(
+                [st.session_state.fields_df, pd.DataFrame([{"name": "", "datatype": "str", "description": ""}])],
+                ignore_index=True,
+            )
+        # NOTE(review): the editor's output is written back into the same
+        # DataFrame that seeds it on the next rerun; confirm against the
+        # st.data_editor docs that this does not drop or duplicate edits.
+        edited_df = st.data_editor(
+            st.session_state.fields_df,
+            width="stretch",
+            num_rows="dynamic",
+            column_config={
+                "name": st.column_config.TextColumn("name"),
+                "datatype": st.column_config.SelectboxColumn("datatype", options=["str", "int", "float"]),
+                "description": st.column_config.TextColumn("description"),
+            },
+            key="fields_editor",
+        )
+        st.session_state.fields_df = edited_df
+        st.markdown('<div class="block-header">Paste Unstructured Text</div>', unsafe_allow_html=True)
+        # The usage hint goes in `help`, not `value`: as a pre-filled value it
+        # hid the placeholder and was itself sent to the model when the user
+        # clicked Extract without pasting anything.
+        unstructured_text = st.text_area(
+            "Example: https://huggingface.co/spaces/opendatalab/MinerU",
+            help="Click on the above link, extract the markdown text from that page and paste it here.",
+            placeholder="Paste your text here...",
+            height=220,
+        )
+        if st.button("Extract Structured Data", type="primary", width="stretch"):
+            with st.spinner("Extracting structured data..."):
+                result = extract_structured(st.session_state.fields_df, unstructured_text)
+                # A dict is a successful extraction; anything else is an error message.
+                if isinstance(result, dict):
+                    st.session_state.structured_result_json = result
+                    st.session_state.structured_result = ""
+                else:
+                    st.session_state.structured_result_json = {}
+                    st.session_state.structured_result = result
+    with right_col:
+        st.markdown("### Structured Output (Transposed Table)")
+        if st.session_state.structured_result_json:
+            # One row per field: transpose the single-record frame into Field/Value columns.
+            transposed_df = (
+                pd.DataFrame([st.session_state.structured_result_json])
+                .T.reset_index()
+                .rename(columns={"index": "Field", 0: "Value"})
+            )
+            st.dataframe(transposed_df, width="stretch", hide_index=True)
+        elif st.session_state.structured_result:
+            st.error(st.session_state.structured_result)
+        else:
+            st.info("Run extraction to see transposed table output.")
+# Build the UI only when this file is run as the main module, not when it is imported.
+if __name__ == "__main__":
+    main()