File size: 4,838 Bytes
45ecbbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""NLP Parser - Extract structured search parameters from natural language."""
import json
from huggingface_hub import InferenceClient
from config import HF_TOKEN, LLM_MODEL


def parse_user_request(text):
    """
    Parse a natural language shopping request into structured parameters.

    Args:
        text: User's plain-English request. None, empty, or whitespace-only
            input short-circuits to an empty result without an API call.

    Returns:
        dict: {
            "searches": [{"query": str, "category": str|None,
                          "min_price": float|None, "max_price": float|None,
                          "sort_by": str|None, "brand": str|None,
                          "store": str|None}, ...],
            "requirements": [str, ...]
        }
        On any LLM or parsing failure, falls back to a single search that
        uses the (stripped) raw text as the query.
    """
    # Guard against None as well as blank input — the original
    # `text.strip()` would raise AttributeError on None.
    if not text or not text.strip():
        return {"searches": [], "requirements": []}

    system_prompt = """You are an expert shopping assistant parser. Given the user's natural language request, return JSON with two keys:

"searches": a list of objects, one per distinct product the user wants. Each object has:
  - query: search keywords (str, required)
  - category: one of [Electronics, Clothing & Apparel, Home & Garden, Health & Beauty, Sports & Outdoors, Toys & Games, Books & Media, Office & School, Food & Grocery, Auto & Parts] or null
  - min_price: number or null
  - max_price: number or null
  - sort_by: "relevance"|"price_low"|"price_high"|"rating"|null
  - brand: str or null
  - store: str or null

"requirements": a list of strings — specific criteria the user mentioned that go BEYOND standard filters. These are things you would need to read a product description or spec sheet to verify. Examples:
  - "espresso only — not drip or pour-over"
  - "manufactured in USA or Italy"
  - "burr grinder, not blade"
  - "BPA-free materials"
  - "compatible with K-cups"
  - "must have HDMI 2.1 port"
  - "vibration pump"
  - "water reservoir at least 1 liter"

Do NOT include price or brand here (those are already in the search object). Only include requirements that need spec-sheet verification.

Return ONLY valid JSON, no commentary."""

    user_message = f"User request: {text}"
    
    try:
        client = InferenceClient(token=HF_TOKEN)
        
        # Call the LLM (low temperature for more deterministic extraction)
        response = client.chat_completion(
            model=LLM_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            max_tokens=1000,
            temperature=0.3,
        )
        
        # Extract the response text
        response_text = response.choices[0].message.content.strip()
        
        # The model may wrap JSON in commentary/fences; extract defensively
        parsed = _extract_json(response_text)
        
        # Normalize to the canonical shape, filling missing fields with None
        validated = _validate_response(parsed)
        
        print(f"NLP Parser extracted: {json.dumps(validated, indent=2)}")
        return validated
        
    except Exception as e:
        # Broad catch is deliberate: any failure (network, auth, malformed
        # LLM output) degrades to a plain keyword search instead of
        # crashing the caller.
        print(f"Error in NLP parsing: {e}")
        import traceback
        traceback.print_exc()
        
        # Fallback: treat the (stripped) text as a simple search query
        return {
            "searches": [{"query": text.strip(), "category": None, "min_price": None,
                         "max_price": None, "sort_by": None, "brand": None, "store": None}],
            "requirements": []
        }


def _extract_json(text):
    """Extract JSON from LLM response that might have extra text."""
    # Try to find JSON block
    start = text.find('{')
    end = text.rfind('}') + 1
    
    if start != -1 and end > start:
        json_str = text[start:end]
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass
    
    # If that fails, try the whole text
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return {}


def _validate_response(data):
    """Validate and fill in missing fields."""
    if not isinstance(data, dict):
        return {"searches": [], "requirements": []}
    
    # Ensure searches is a list
    searches = data.get("searches", [])
    if not isinstance(searches, list):
        searches = []
    
    # Validate each search object
    validated_searches = []
    for search in searches:
        if not isinstance(search, dict):
            continue
        
        validated_searches.append({
            "query": search.get("query", ""),
            "category": search.get("category"),
            "min_price": search.get("min_price"),
            "max_price": search.get("max_price"),
            "sort_by": search.get("sort_by"),
            "brand": search.get("brand"),
            "store": search.get("store"),
        })
    
    # Ensure requirements is a list of strings
    requirements = data.get("requirements", [])
    if not isinstance(requirements, list):
        requirements = []
    
    requirements = [str(r) for r in requirements if r]
    
    return {
        "searches": validated_searches,
        "requirements": requirements
    }