# shopper/nlp_parser.py
# NOTE(review): the following Hugging Face Hub page header was pasted into the
# source and is preserved here as a comment so the file parses:
#   anly656's picture / Upload 5 files / 45ecbbd verified
"""NLP Parser - Extract structured search parameters from natural language."""
import json
from huggingface_hub import InferenceClient
from config import HF_TOKEN, LLM_MODEL
def parse_user_request(text):
    """Parse a natural-language shopping request into structured parameters.

    Args:
        text: User's plain-English request.

    Returns:
        dict: {
            "searches": [{"query": str, "max_price": float, ...}, ...],
            "requirements": [str, str, ...]
        }
        On any parsing/LLM failure, falls back to a single search whose
        query is the raw input text.
    """
    # Nothing to parse: return an empty, well-formed result.
    if not text.strip():
        return {"searches": [], "requirements": []}

    parser_prompt = """You are an expert shopping assistant parser. Given the user's natural language request, return JSON with two keys:
"searches": a list of objects, one per distinct product the user wants. Each object has:
- query: search keywords (str, required)
- category: one of [Electronics, Clothing & Apparel, Home & Garden, Health & Beauty, Sports & Outdoors, Toys & Games, Books & Media, Office & School, Food & Grocery, Auto & Parts] or null
- min_price: number or null
- max_price: number or null
- sort_by: "relevance"|"price_low"|"price_high"|"rating"|null
- brand: str or null
- store: str or null
"requirements": a list of strings — specific criteria the user mentioned that go BEYOND standard filters. These are things you would need to read a product description or spec sheet to verify. Examples:
- "espresso only — not drip or pour-over"
- "manufactured in USA or Italy"
- "burr grinder, not blade"
- "BPA-free materials"
- "compatible with K-cups"
- "must have HDMI 2.1 port"
- "vibration pump"
- "water reservoir at least 1 liter"
Do NOT include price or brand here (those are already in the search object). Only include requirements that need spec-sheet verification.
Return ONLY valid JSON, no commentary."""

    request_message = f"User request: {text}"

    try:
        llm = InferenceClient(token=HF_TOKEN)

        # Ask the LLM to translate the request into the JSON schema above.
        completion = llm.chat_completion(
            model=LLM_MODEL,
            messages=[
                {"role": "system", "content": parser_prompt},
                {"role": "user", "content": request_message},
            ],
            max_tokens=1000,
            temperature=0.3,
        )

        raw_reply = completion.choices[0].message.content.strip()

        # The model may wrap the JSON in prose; extract it, then normalize
        # the structure so downstream code can rely on every key existing.
        structured = _extract_json(raw_reply)
        result = _validate_response(structured)

        print(f"NLP Parser extracted: {json.dumps(result, indent=2)}")
        return result

    except Exception as e:
        print(f"Error in NLP parsing: {e}")
        import traceback
        traceback.print_exc()
        # Fallback: treat the text as a simple search query
        return {
            "searches": [{"query": text, "category": None, "min_price": None,
                          "max_price": None, "sort_by": None, "brand": None, "store": None}],
            "requirements": []
        }
def _extract_json(text):
"""Extract JSON from LLM response that might have extra text."""
# Try to find JSON block
start = text.find('{')
end = text.rfind('}') + 1
if start != -1 and end > start:
json_str = text[start:end]
try:
return json.loads(json_str)
except json.JSONDecodeError:
pass
# If that fails, try the whole text
try:
return json.loads(text)
except json.JSONDecodeError:
return {}
def _validate_response(data):
"""Validate and fill in missing fields."""
if not isinstance(data, dict):
return {"searches": [], "requirements": []}
# Ensure searches is a list
searches = data.get("searches", [])
if not isinstance(searches, list):
searches = []
# Validate each search object
validated_searches = []
for search in searches:
if not isinstance(search, dict):
continue
validated_searches.append({
"query": search.get("query", ""),
"category": search.get("category"),
"min_price": search.get("min_price"),
"max_price": search.get("max_price"),
"sort_by": search.get("sort_by"),
"brand": search.get("brand"),
"store": search.get("store"),
})
# Ensure requirements is a list of strings
requirements = data.get("requirements", [])
if not isinstance(requirements, list):
requirements = []
requirements = [str(r) for r in requirements if r]
return {
"searches": validated_searches,
"requirements": requirements
}