File size: 7,683 Bytes
9280c07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ecf1e4f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import re
from typing import Dict, List, Any

# Import keywords from separate files
from utils.genres_data import GENRES_KEYWORDS
from utils.moods_data import MOOD_KEYWORDS


# --- Pre-compiled patterns shared by the helpers below ---------------------

_MOVIE_RE = re.compile(r"\b(movie|film|picture|flick)s?\b")
_BOOK_RE = re.compile(r"\b(book|novel|read|story)s?\b")

# (audience label, genre tag that duplicates that label, pattern).
# Order is precedence: the first pattern that matches wins.  Plural forms are
# accepted so that "young adults" / "adults" / "teenagers" are recognised.
_AUDIENCE_RULES = (
    (
        "children",
        "children",
        re.compile(r"\b(?:children|kids?|child(?:ren's)?|younger audiences?|juveniles?)\b"),
    ),
    (
        "young_adult",
        "young adult",
        re.compile(r"\b(?:young adults?|teen(?:s|age|agers?)?|ya|adolescents?)\b"),
    ),
    (
        "adult",
        "adult",
        re.compile(r"\b(?:adults?|mature|grown-ups?|general audiences?)\b"),
    ),
)

# (era label, pattern); the first pattern that matches wins.
_ERA_RULES = (
    ("classic", re.compile(r"\b(?:classic|classical|old|vintage|timeless)\b")),
    ("contemporary", re.compile(r"\b(?:contemporary|modern|recent|present-day|current)\b")),
    ("historical", re.compile(r"\b(?:historical|period|past|ancient|medieval|victorian|retro)\b")),
    ("future", re.compile(r"\b(?:future|futuristic)\b")),
)

# "90s", "'90s", "90's", "1990s", "1990's".  Only round decades are accepted
# and the number must not be glued to a preceding word character.
_DECADE_RE = re.compile(r"(?<!\w)'?(\d{3}0|\d0)'?s\b")
_CURRENT_DECADE_RE = re.compile(r"\b(?:current|recent) decade\b|\b2020s\b")

# Cue phrases that may introduce a person's name, in priority order.
_PERSON_CUES = tuple(
    re.compile(cue + r"\s+")
    for cue in (
        r"\bby",
        r"\b(?:directed\s+by|director)",
        r"\b(?:written\s+by|author)",
        r"\b(?:starring|featuring|with)",
        r"\b(?:from|like)",
    )
)
# Characters that may belong to a (lower-cased) name following a cue.
_NAME_RUN_RE = re.compile(r"[a-z\s.\-]*")
# Words that end a name; without them "from the 90s" would yield "The".
_NAME_STOP_WORDS = frozenset({
    "a", "an", "the", "this", "that", "these", "those", "some", "any",
    "and", "or", "but",
    "for", "from", "in", "on", "at", "to", "about", "with", "by", "like",
    "who", "which", "is", "are", "was",
    "set", "released", "made", "published",
    "directed", "written", "starring", "featuring",
    "i", "me", "my", "something", "anything",
})


def _match_keyword_tags(text: str, table: Dict[str, Any]) -> List[str]:
    """Return the keys of *table* having a keyword that occurs in *text* as a whole word.

    Tags are returned in the table's own order, so the result is deterministic.
    """
    return [
        tag
        for tag, keywords in table.items()
        if any(re.search(r'\b' + re.escape(k) + r'\b', text) for k in keywords)
    ]


def _detect_media_type(text: str):
    """Return "book", "movie" or None; book words win when both kinds appear."""
    if _BOOK_RE.search(text):
        return "book"
    if _MOVIE_RE.search(text):
        return "movie"
    return None


def _detect_decade(text: str):
    """Return the decade mentioned in *text* as a four-digit string such as "1990s", or None."""
    decade = None
    found = _DECADE_RE.search(text)
    if found:
        digits = found.group(1)
        if len(digits) == 2:
            # Two-digit shorthand: 00s/10s/20s are read as this century,
            # everything else as the 1900s.
            century = "20" if digits in ("00", "10", "20") else "19"
            digits = century + digits
        decade = f"{digits}s"
    # "current decade" / an explicit "2020s" overrides any other decade found.
    # NOTE: the current decade is hard-coded as the 2020s.
    if _CURRENT_DECADE_RE.search(text):
        decade = "2020s"
    return decade


def _read_name(tail: str):
    """Read a name from the start of *tail* (lower-cased text following a cue).

    Collects words until a stop word or a non-name character is reached and
    returns them title-cased, or None when no name-like word comes first.
    """
    words = []
    for token in _NAME_RUN_RE.match(tail).group().split():
        bare = token.strip(".-")
        if not bare or bare in _NAME_STOP_WORDS:
            break
        words.append(token)
    if not words:
        return None
    # Drop sentence punctuation around the name but keep inner dots/hyphens.
    name = " ".join(words).strip(".-")
    # Upper-case every letter that starts a word or follows "." / "-",
    # so initials and hyphenated names come out as "J.R.R." / "Jean-Luc".
    return re.sub(r"(?<![a-z])[a-z]", lambda m: m.group().upper(), name)


def _detect_person(text: str):
    """Return the first plausible person name introduced by a cue phrase, or None."""
    for cue in _PERSON_CUES:
        for hit in cue.finditer(text):
            name = _read_name(text[hit.end():])
            if name:
                return name
    return None


def parse_user_query(query: str) -> Dict[str, Any]:
    """
    Parses a natural language user query to extract structured tags
    like genres, moods, target audience, era, decade, specific person, and media type preference.

    Matching is case-insensitive and keyword/regex based.

    Args:
        query (str): The user's input query string. ``None`` or an empty
            string yields a result with no tags set.

    Returns:
        Dict[str, Any]: A dictionary containing extracted tags.
            Example: {
                "genres": ["sci-fi", "thriller"],   # keys of GENRES_KEYWORDS, in table order
                "mood": ["suspenseful", "dark"],    # keys of MOOD_KEYWORDS, in table order
                "target_audience": "adult",         # or "children", "young_adult", None
                "era": "contemporary",              # or "classic", "historical", "future", None
                "decade": "1990s",                  # "90s" is expanded to "1990s"; or None
                "specific_person": "Christopher Nolan",  # title-cased name, or None
                "media_type_preference": "book",    # or "movie", or None
                "raw_query": "<the query exactly as passed in>"
            }
    """
    query_lower = (query or "").lower()

    genres = _match_keyword_tags(query_lower, GENRES_KEYWORDS)
    moods = _match_keyword_tags(query_lower, MOOD_KEYWORDS)

    # --- Target Audience ---
    target_audience = None
    for audience, duplicate_genre, pattern in _AUDIENCE_RULES:
        if pattern.search(query_lower):
            target_audience = audience
            # An audience reported on its own field is not repeated as a genre.
            if duplicate_genre in genres:
                genres.remove(duplicate_genre)
            break

    # --- Era ---
    era = None
    for label, pattern in _ERA_RULES:
        if pattern.search(query_lower):
            era = label
            break

    return {
        "genres": genres,
        "mood": moods,
        "target_audience": target_audience,
        "era": era,
        "decade": _detect_decade(query_lower),
        "specific_person": _detect_person(query_lower),
        "media_type_preference": _detect_media_type(query_lower),
        "raw_query": query,  # Keep original query for debugging/explanation
    }


if __name__ == '__main__':
    # Demo driver: run the parser over a fixed set of sample queries and
    # print each query followed by the tags extracted from it.
    demo_queries = (
        "I want a heartwarming drama movie for young adults from the 90s.",
        "Recommend a thrilling sci-fi book by Isaac Asimov.",
        "A dark mystery by Agatha Christie.",
        "Show me action films for kids under 10.",
        "I need a romantic comedy released in the 2000s.",
        "Any classic historical fiction?",
        "looking for something uplifting for ages 18+",
        "A book about adventure for children.",
        "A suspenseful thriller for adults.",
        "A historical drama set in the 1800s.",
        "A funny animation from the 80s.",
        "A contemporary romance novel.",
        "A classic sci-fi movie directed by Stanley Kubrick.",
        "I want a thriller by Stephen King.",
        "A Japanese film like Akira Kurosawa's.",
        "I'm feeling sad, recommend a melancholic movie.",
        "Give me an exciting thriller movie.",
        "I'm in the mood for something lighthearted.",
        "Looking for a really dark and grim book.",
        "Need something joyful to watch.",
        "I need an uplifting and inspiring film.",
        "Show me a truly gloomy and depressing story.",
        "Find me a film that's both chaotic and funny.",
        "I want something thought-provoking and deep.",
        "Looking for a movie that's really tense and nerve-wracking.",
        "Something wistful and nostalgic.",
        "I'm feeling angry, show me something intense and violent.",
        "Recommend a bizarre and absurd book.",
        "A beautiful and poignant love story.",
        "I need a really witty comedy.",
        "Something raw and gritty.",
        "A grand, sweeping epic.",
        "Something that brings tears to my eyes.",
        "Find me a slow-paced, meditative film.",
        "A mind-bending psychological thriller.",
    )

    for q in demo_queries:
        parsed = parse_user_query(q)
        # The second line ends with "\n" so a blank line separates the entries.
        print(f"Query: '{q}'")
        print(f"Parsed: {parsed}\n")