GenAIDevTOProd committed on
Commit
8983b2d
·
verified ·
1 Parent(s): f8c143d

Upload folder using huggingface_hub

Browse files
Files changed (8) hide show
  1. README.md +66 -8
  2. __init__.py +1 -0
  3. examples/test_slotmatch.py +21 -0
  4. extractor.py +53 -0
  5. pyproject.toml +3 -0
  6. schema.py +28 -0
  7. setup.py +21 -0
  8. utils.py +22 -0
README.md CHANGED
@@ -1,10 +1,68 @@
 
 
 
 
1
  ---
2
- title: Slotmatch
3
- emoji: 🏃
4
- colorFrom: blue
5
- colorTo: yellow
6
- sdk: static
7
- pinned: false
8
- ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SlotMatch
2
+
3
+ **SlotMatch** is a lightweight Python package for extracting structured key-value pairs from unstructured or noisy LLM outputs. It supports regex-based parsing, fuzzy key recovery, schema validation, and confidence scoring. Perfect for production RAG, chatbot, and NLU pipelines.
4
+
5
  ---
 
 
 
 
 
 
 
6
 
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install slotmatch
11
+ ```
12
+ ## Features
13
+
14
+ - Regex-based value extraction
15
+
16
+ - Fuzzy key mapping (e.g., intnt → intent)
17
+
18
+ - Schema validation for expected keys and types
19
+
20
+ - Type coercion (str, int, float, bool)
21
+
22
+ - Confidence scoring (regex = high, fuzzy = partial, fallback = 0)
23
+
24
+ - Lightweight, no external dependencies
25
+
26
+ ## Usage
27
+
28
+ from slotmatch import SlotExtractor
29
+
30
+ schema = {
31
+ "name": str,
32
+ "intent": str,
33
+ "destination": str
34
+ }
35
+
36
+ llm_output = '''
37
+ Hi, I'm Alice.
38
+ {
39
+ "intnt": "book_flight",
40
+ "dest": "NYC",
41
+ "name": "Alice"
42
+ }
43
+ '''
44
+
45
+ extractor = SlotExtractor(schema)
46
+ print(extractor.extract(llm_output))
47
+
48
+ ## Output
49
+
50
+ {
51
+ 'name': {'value': 'Alice', 'confidence': 1.0},
52
+ 'intent': {'value': 'book_flight', 'confidence': 0.64},  # approximate; recovered by fuzzy match from "intnt"
53
+ 'destination': {'value': None, 'confidence': 0.0}
54
+ }
55
+
56
+ ## Example Use Cases
57
+
58
+ - Post-processing LLM outputs (chatbots, assistants, tools)
59
+
60
+ - Extracting form fields or user intents
61
+
62
+ - Structuring data for downstream APIs or storage
63
+
64
+ - Integrating LLMs with business logic (field validation, routing)
65
+
66
+ ## License
67
+
68
+ This project is licensed under the MIT License.
__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .extractor import SlotExtractor
examples/test_slotmatch.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Runnable demo: recover schema slots from a noisy LLM reply.

Run it directly (``python examples/test_slotmatch.py``).  The work happens
inside ``main()`` so that merely importing this module -- which pytest does
for any file named ``test_*.py`` -- has no side effects.
"""
from pprint import pprint

from slotmatch import SlotExtractor

# Slots we expect, mapped to the type each value should be coerced to.
schema = {
    "name": str,
    "intent": str,
    "destination": str,
}

# The reply deliberately misspells "intent" and abbreviates "destination"
# to exercise the fuzzy-match and fallback paths.
llm_output = '''
Hi, I’m Alice. I’d like to book a flight to New York.
{
"intnt": "book_flight",
"dest": "New York",
"name": "Alice"
}
'''


def main() -> None:
    """Extract the slots from ``llm_output`` and pretty-print the result."""
    extractor = SlotExtractor(schema)
    result = extractor.extract(llm_output)
    pprint(result)


if __name__ == "__main__":
    main()
extractor.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from slotmatch.schema import SchemaValidator
2
+ from slotmatch.utils import extract_value_by_regex, fuzzy_match_key, compute_confidence
3
+
4
class SlotExtractor:
    """Extract schema-defined slots from free-form text.

    For every key in the schema the extractor tries, in order:

    1. a direct lookup of the key in the text,
    2. a lookup through the most similar key that appears in the text,
    3. an empty slot (``value`` is ``None``, ``confidence`` is ``0.0``).
    """

    def __init__(self, schema: dict):
        """Validate ``schema`` and cache its keys.

        Args:
            schema: Mapping of slot name to the type its value is coerced to.
                Validation is delegated to ``SchemaValidator``.
        """
        self.validator = SchemaValidator(schema)
        self.schema = self.validator.get_schema()
        self.schema_keys = list(self.schema.keys())

    def extract(self, text: str) -> dict:
        """Return ``{slot: {"value": ..., "confidence": ...}}`` for every schema key.

        Args:
            text: Raw text (for example an LLM reply) to search for key/value pairs.

        Returns:
            A dict with one entry per schema key.  Slots that cannot be
            recovered carry ``value=None`` and ``confidence=0.0``.
        """
        result = {}
        # Keys present in the text do not depend on the slot being resolved,
        # so scan the text at most once per call, and only if a fuzzy lookup
        # is actually needed.
        candidate_keys = None

        for expected_key in self.schema_keys:
            expected_type = self.schema[expected_key]

            # 1. Try regex directly
            raw_value = extract_value_by_regex(text, expected_key)
            if raw_value is not None:
                result[expected_key] = {
                    "value": self._coerce_type(raw_value, expected_type),
                    "confidence": compute_confidence("regex"),
                }
                continue

            # 2. Try fuzzy match against the keys that occur in the text
            if candidate_keys is None:
                candidate_keys = self._get_all_keys_from_text(text)
            fuzzy_key, score = fuzzy_match_key(expected_key, candidate_keys)
            if fuzzy_key:
                raw_value = extract_value_by_regex(text, fuzzy_key)
                if raw_value is not None:
                    result[expected_key] = {
                        "value": self._coerce_type(raw_value, expected_type),
                        # Scale the base fuzzy confidence by the similarity score.
                        "confidence": compute_confidence("fuzzy") * score,
                    }
                    continue

            # 3. Fallback
            result[expected_key] = {
                "value": None,
                "confidence": 0.0,
            }

        return result

    def _get_all_keys_from_text(self, text: str) -> list:
        """Return the distinct key-like tokens in ``text``, in order of appearance.

        A key-like token is a run of word characters or hyphens, optionally
        quoted, followed by ``:`` or ``=``.
        """
        import re
        pattern = r'["\']?([\w-]+)["\']?\s*[:=]'
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is deterministic across runs (a set would not be).
        return list(dict.fromkeys(re.findall(pattern, text)))

    def _coerce_type(self, value, expected_type):
        """Convert ``value`` to ``expected_type``; return it unchanged on failure.

        Booleans are parsed from text: ``true``, ``yes`` and ``1`` (any case,
        surrounding whitespace ignored) are ``True``, everything else ``False``.
        """
        try:
            if expected_type is bool:
                return value.strip().lower() in ('true', 'yes', '1')
            return expected_type(value)
        except (AttributeError, TypeError, ValueError):
            # Only conversion failures are swallowed; anything else
            # (KeyboardInterrupt, programming errors) still propagates.
            return value  # fallback to original
pyproject.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
schema.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # slotmatch/schema.py
2
+
3
# Types a slot value may be coerced to.
ALLOWED_TYPES = {str, int, float, bool}


class SchemaValidationError(Exception):
    """Custom error when schema is invalid."""


class SchemaValidator:
    """Check that a slot schema maps string keys to supported types."""

    def __init__(self, schema: dict):
        """Store ``schema`` and validate it immediately.

        Raises:
            SchemaValidationError: If ``schema`` is not a dict, a key is not a
                string, or a value is not one of ``ALLOWED_TYPES``.
        """
        self.schema = schema
        self._validate_schema()

    def _validate_schema(self):
        """Raise ``SchemaValidationError`` on the first problem found."""
        if not isinstance(self.schema, dict):
            raise SchemaValidationError("Schema must be a dictionary.")

        for key, expected_type in self.schema.items():
            if not isinstance(key, str):
                raise SchemaValidationError(f"Key '{key}' must be a string.")
            if not self._is_allowed_type(expected_type):
                raise SchemaValidationError(
                    f"Unsupported type '{expected_type}' for key '{key}'. "
                    f"Supported types: {ALLOWED_TYPES}"
                )

    @staticmethod
    def _is_allowed_type(expected_type) -> bool:
        """Return True if ``expected_type`` is one of ``ALLOWED_TYPES``."""
        try:
            return expected_type in ALLOWED_TYPES
        except TypeError:
            # Unhashable specs such as ``[str]`` cannot be looked up in a set;
            # report them as unsupported instead of leaking a raw TypeError.
            return False

    def get_schema(self):
        """Return the validated schema (the same object that was passed in)."""
        return self.schema
setup.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup, find_packages

# Read the README explicitly as UTF-8 and close the handle deterministically.
# The README contains non-ASCII characters, so relying on the platform default
# encoding can fail on systems whose default is not UTF-8.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# NOTE(review): author / author_email are still template placeholders --
# replace them before publishing.
# NOTE(review): find_packages() only discovers directories that contain an
# __init__.py; confirm the modules really live in a ``slotmatch/`` directory.
setup(
    name='slotmatch',
    version='0.1.0',
    author='Your Name',
    author_email='your.email@example.com',
    description='Extract structured key-value pairs from unstructured LLM output using regex, fuzzy matching, and schema validation.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://huggingface.co/GenAIDevTOProd',
    packages=find_packages(),
    include_package_data=True,
    python_requires='>=3.7',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    install_requires=[],
)
utils.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import difflib
3
+
4
def extract_value_by_regex(text: str, key: str):
    """Return the quoted value assigned to ``key`` in ``text``, or ``None``.

    Recognises ``key: "value"`` and ``key = 'value'``; the key itself may be
    quoted or bare.  Only quoted values are captured.

    Args:
        text: Text to search.
        key: Literal key name.  It is escaped before being placed in the
            pattern, so characters such as ``.`` or ``(`` match themselves.

    Returns:
        The text between the value's quotes for the first match, else ``None``.
    """
    pattern = (
        # The look-behind stops ``key`` from matching the tail of a longer
        # key, e.g. "name" inside "username" or "first-name".
        r'["\']?(?<![\w-])'
        + re.escape(key)
        + r'["\']?\s*[:=]\s*["\']([^"\']+)["\']'
    )
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return None
10
+
11
def fuzzy_match_key(input_key: str, schema_keys: list, cutoff: float = 0.75):
    """Find the candidate most similar to ``input_key``.

    Args:
        input_key: Key to look for.
        schema_keys: Candidate keys to compare against.
        cutoff: Minimum similarity (0..1) a candidate needs to qualify.

    Returns:
        ``(best_candidate, similarity)``, or ``(None, 0.0)`` when no candidate
        reaches ``cutoff`` or there are no candidates.
    """
    if not schema_keys:
        return None, 0.0

    best_match = difflib.get_close_matches(input_key, schema_keys, n=1, cutoff=cutoff)
    if not best_match:
        return None, 0.0

    candidate = best_match[0]
    # SequenceMatcher.ratio() can depend on argument order.  get_close_matches
    # scores with the candidate as the first sequence, so use the same order
    # here; the reported score is then the one that was checked against cutoff.
    score = difflib.SequenceMatcher(None, candidate, input_key).ratio()
    return candidate, score
16
+
17
def compute_confidence(method: str):
    """Return the base confidence weight for an extraction method.

    Known methods are ``"regex"`` (1.0), ``"fuzzy"`` (0.7) and ``"semantic"``
    (0.5); any other method name yields 0.0.
    """
    weights = {
        "regex": 1.0,
        "fuzzy": 0.7,
        "semantic": 0.5,
    }
    if method in weights:
        return weights[method]
    return 0.0