Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- README.md +66 -8
- __init__.py +1 -0
- examples/test_slotmatch.py +21 -0
- extractor.py +53 -0
- pyproject.toml +3 -0
- schema.py +28 -0
- setup.py +21 -0
- utils.py +22 -0
README.md
CHANGED
|
@@ -1,10 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
title: Slotmatch
|
| 3 |
-
emoji: 🏃
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: yellow
|
| 6 |
-
sdk: static
|
| 7 |
-
pinned: false
|
| 8 |
-
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SlotMatch
|
| 2 |
+
|
| 3 |
+
**SlotMatch** is a lightweight Python package for extracting structured key-value pairs from unstructured or noisy LLM outputs. It supports regex-based parsing, fuzzy key recovery, schema validation, and confidence scoring. Perfect for production RAG, chatbot, and NLU pipelines.
|
| 4 |
+
|
| 5 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
## Installation
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
pip install slotmatch
```
|
| 11 |
+
|
| 12 |
+
## Features
|
| 13 |
+
|
| 14 |
+
- Regex-based value extraction
|
| 15 |
+
|
| 16 |
+
- Fuzzy key mapping (e.g., intnt → intent)
|
| 17 |
+
|
| 18 |
+
- Schema validation for expected keys and types
|
| 19 |
+
|
| 20 |
+
- Type coercion (str, int, float, bool)
|
| 21 |
+
|
| 22 |
+
- Confidence scoring (regex = high, fuzzy = partial, fallback = 0)
|
| 23 |
+
|
| 24 |
+
- Lightweight, no external dependencies
|
| 25 |
+
|
| 26 |
+
## Usage
|
| 27 |
+
|
| 28 |
+
from slotmatch import SlotExtractor
|
| 29 |
+
|
| 30 |
+
schema = {
|
| 31 |
+
"name": str,
|
| 32 |
+
"intent": str,
|
| 33 |
+
"destination": str
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
llm_output = '''
|
| 37 |
+
Hi, I'm Alice.
|
| 38 |
+
{
|
| 39 |
+
"intnt": "book_flight",
|
| 40 |
+
"dest": "NYC",
|
| 41 |
+
"name": "Alice"
|
| 42 |
+
}
|
| 43 |
+
'''
|
| 44 |
+
|
| 45 |
+
extractor = SlotExtractor(schema)
|
| 46 |
+
print(extractor.extract(llm_output))
|
| 47 |
+
|
| 48 |
+
## Output
|
| 49 |
+
|
| 50 |
+
{
|
| 51 |
+
'name': {'value': 'Alice', 'confidence': 1.0},
|
| 52 |
+
'intent': {'value': 'book_flight', 'confidence': 0.64},  # approximate: 0.7 × fuzzy key similarity
|
| 53 |
+
'destination': {'value': None, 'confidence': 0.0}
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
## Example Use Cases
|
| 57 |
+
|
| 58 |
+
- Post-processing LLM outputs (chatbots, assistants, tools)
|
| 59 |
+
|
| 60 |
+
- Extracting form fields or user intents
|
| 61 |
+
|
| 62 |
+
- Structuring data for downstream APIs or storage
|
| 63 |
+
|
| 64 |
+
- Integrating LLMs with business logic (field validation, routing)
|
| 65 |
+
|
| 66 |
+
## License
|
| 67 |
+
|
| 68 |
+
This project is licensed under the MIT License.
|
__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .extractor import SlotExtractor
|
examples/test_slotmatch.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from slotmatch import SlotExtractor
|
| 2 |
+
from pprint import pprint
|
| 3 |
+
|
| 4 |
+
# Slots we expect to recover from the model output, with their target types.
schema = {"name": str, "intent": str, "destination": str}

# Sample LLM response: free text followed by a JSON-like object whose keys
# are partly misspelled ("intnt") or abbreviated ("dest").
llm_output = '''
Hi, I’m Alice. I’d like to book a flight to New York.
{
"intnt": "book_flight",
"dest": "New York",
"name": "Alice"
}
'''

# Build the extractor for the schema and pretty-print what it recovers.
extractor = SlotExtractor(schema)
result = extractor.extract(llm_output)
pprint(result)
|
extractor.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from slotmatch.schema import SchemaValidator
|
| 2 |
+
from slotmatch.utils import extract_value_by_regex, fuzzy_match_key, compute_confidence
|
| 3 |
+
|
| 4 |
+
class SlotExtractor:
    """Extract the slots named in a schema from free-form text.

    For every schema key the extractor tries, in order:

    1. a direct ``key: "value"`` / ``key = "value"`` lookup,
    2. a fuzzy lookup using the closest key-like token found in the text,
    3. a fallback entry with ``value=None`` and ``confidence=0.0``.
    """

    def __init__(self, schema: dict):
        """Validate *schema* and remember its keys.

        Args:
            schema: Mapping of slot name to target type. Validation is
                delegated to ``SchemaValidator``, which raises on an
                invalid schema.
        """
        self.validator = SchemaValidator(schema)
        self.schema = self.validator.get_schema()
        self.schema_keys = list(self.schema.keys())

    def extract(self, text: str) -> dict:
        """Return ``{slot: {"value": ..., "confidence": ...}}`` for *text*.

        Every schema key is present in the result; keys that could not be
        recovered map to ``{"value": None, "confidence": 0.0}``.
        """
        result = {}
        # The key-like tokens depend only on the text, so they are collected
        # at most once (and only if some key needs the fuzzy step).
        text_keys = None

        for expected_key in self.schema_keys:
            expected_type = self.schema[expected_key]

            # 1. Try regex directly
            raw_value = extract_value_by_regex(text, expected_key)
            if raw_value is not None:
                result[expected_key] = {
                    "value": self._coerce_type(raw_value, expected_type),
                    "confidence": compute_confidence("regex"),
                }
                continue

            # 2. Try fuzzy match against the keys that appear in the text
            if text_keys is None:
                text_keys = self._get_all_keys_from_text(text)
            fuzzy_key, score = fuzzy_match_key(expected_key, text_keys)
            if fuzzy_key:
                raw_value = extract_value_by_regex(text, fuzzy_key)
                if raw_value is not None:
                    result[expected_key] = {
                        "value": self._coerce_type(raw_value, expected_type),
                        # Scale the fuzzy base confidence by the similarity.
                        "confidence": compute_confidence("fuzzy") * score,
                    }
                    continue

            # 3. Fallback
            result[expected_key] = {
                "value": None,
                "confidence": 0.0,
            }

        return result

    def _get_all_keys_from_text(self, text: str) -> list:
        """Return the distinct key-like tokens that precede ``:`` or ``=``.

        The order of the returned list is unspecified (it comes from a set).
        """
        import re

        pattern = r'["\']?([\w-]+)["\']?\s*[:=]'
        return list(set(re.findall(pattern, text)))

    def _coerce_type(self, value, expected_type):
        """Convert *value* to *expected_type*, best effort.

        Booleans are recognised from the strings ``true``/``yes``/``1``
        (case-insensitive); anything else is ``False``. If the conversion
        fails, the original value is returned unchanged.
        """
        try:
            if expected_type is bool:
                return value.lower() in ['true', 'yes', '1']
            return expected_type(value)
        except (ValueError, TypeError, AttributeError):
            # Only conversion failures are swallowed; KeyboardInterrupt and
            # SystemExit must propagate.
            return value  # fallback to original
|
pyproject.toml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=61.0"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
schema.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# slotmatch/schema.py
|
| 2 |
+
|
| 3 |
+
# Types a schema may request for a slot value.
ALLOWED_TYPES = {str, int, float, bool}


class SchemaValidationError(Exception):
    """Custom error when schema is invalid."""
    pass


class SchemaValidator:
    """Check that a slot schema maps string keys to supported types."""

    def __init__(self, schema: dict):
        """Store *schema* and validate it immediately.

        Raises:
            SchemaValidationError: If the schema is not a dict, a key is not
                a string, or a value is not one of ``ALLOWED_TYPES``.
        """
        self.schema = schema
        self._validate_schema()

    def _validate_schema(self):
        """Raise ``SchemaValidationError`` if the stored schema is invalid."""
        if not isinstance(self.schema, dict):
            raise SchemaValidationError("Schema must be a dictionary.")

        for key, expected_type in self.schema.items():
            if not isinstance(key, str):
                raise SchemaValidationError(f"Key '{key}' must be a string.")
            # Compare by identity rather than set membership: an unhashable
            # type spec (e.g. ``[str]``) would otherwise escape as a raw
            # TypeError instead of a SchemaValidationError.
            if not any(expected_type is allowed for allowed in ALLOWED_TYPES):
                raise SchemaValidationError(
                    f"Unsupported type '{expected_type}' for key '{key}'. "
                    f"Supported types: {ALLOWED_TYPES}"
                )

    def get_schema(self):
        """Return the validated schema (the same dict object that was given)."""
        return self.schema
|
setup.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import setup, find_packages
|
| 2 |
+
|
| 3 |
+
# Read the long description with an explicit encoding and close the file
# deterministically; the README contains non-ASCII characters.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name='slotmatch',
    version='0.1.0',
    author='Your Name',
    author_email='your.email@example.com',
    description='Extract structured key-value pairs from unstructured LLM output using regex, fuzzy matching, and schema validation.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://huggingface.co/GenAIDevTOProd',
    packages=find_packages(),
    include_package_data=True,
    python_requires='>=3.7',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    install_requires=[],
)
|
utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import difflib
|
| 3 |
+
|
| 4 |
+
def extract_value_by_regex(text: str, key: str):
    """Return the quoted value assigned to *key* in *text*, or ``None``.

    Recognises ``key: "value"`` and ``key = "value"`` where the key may be
    wrapped in single or double quotes and the value must be quoted.

    Args:
        text: Text to search.
        key: Key name, matched literally (regex metacharacters are escaped).

    Returns:
        The first matching value without its quotes, or ``None`` if the key
        is not found.
    """
    # Escape the key so names such as "a.b" or "c++" are matched literally
    # instead of being interpreted (or rejected) as regex syntax.
    escaped_key = re.escape(key)
    # The lookbehind stops the key from matching as the tail of a longer key
    # (e.g. "name" inside "username" or "first-name").
    pattern = rf'(?<![\w-])["\']?{escaped_key}["\']?\s*[:=]\s*["\']([^"\']+)["\']'
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return None
|
| 10 |
+
|
| 11 |
+
def fuzzy_match_key(input_key: str, schema_keys: list, cutoff: float = 0.75):
    """Find the candidate in *schema_keys* that is closest to *input_key*.

    Args:
        input_key: The key to look for.
        schema_keys: Candidate keys to compare against.
        cutoff: Minimum similarity (0..1) a candidate needs to qualify.

    Returns:
        ``(candidate, similarity)`` for the best qualifying candidate, or
        ``(None, 0.0)`` when no candidate reaches *cutoff*.
    """
    candidates = difflib.get_close_matches(
        input_key, schema_keys, n=1, cutoff=cutoff
    )
    if not candidates:
        return None, 0.0

    closest = candidates[0]
    similarity = difflib.SequenceMatcher(None, input_key, closest).ratio()
    return closest, similarity
|
| 16 |
+
|
| 17 |
+
def compute_confidence(method: str):
    """Return the base confidence weight for an extraction *method*.

    Known methods are ``"regex"``, ``"fuzzy"`` and ``"semantic"``; any other
    method name yields ``0.0``.
    """
    base_weights = {"regex": 1.0, "fuzzy": 0.7, "semantic": 0.5}
    try:
        return base_weights[method]
    except KeyError:
        return 0.0
|