badminton001 committed on
Commit
9280c07
·
verified ·
1 Parent(s): ec01993

Update utils/query_parser.py

Browse files
Files changed (1) hide show
  1. utils/query_parser.py +177 -178
utils/query_parser.py CHANGED
@@ -1,178 +1,177 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import re
5
- from typing import Dict, List, Any
6
-
7
- # Import keywords from separate files
8
- from utils.genres_data import GENRES_KEYWORDS
9
- from utils.moods_data import MOOD_KEYWORDS
10
-
11
-
12
# Media-type wording. "stories" is listed explicitly because the plural of
# "story" is not formed by appending "s".
_MOVIE_RE = re.compile(r"\b(?:movie|film|picture|flick)s?\b")
_BOOK_RE = re.compile(r"\b(?:(?:book|novel|read|story)s?|stories)\b")

# Audience rules, checked in order: (tag value, genre label to drop, pattern).
# "young adult" must be tested before "adult" because the former contains the
# latter. Plural forms are accepted so "for adults" / "young adults" match.
_AUDIENCE_RULES = (
    (
        "children",
        "children",
        re.compile(r"\b(?:children|kids?|child(?:ren's)?|younger audiences?|juveniles?)\b"),
    ),
    (
        "young_adult",
        "young adult",
        re.compile(r"\b(?:young adults?|teens?|teenagers?|ya|adolescents?)\b"),
    ),
    (
        "adult",
        "adult",
        re.compile(r"\b(?:adults?|mature|grown-ups?|general audiences?)\b"),
    ),
)

# Era rules, checked in order; the first matching rule wins.
_ERA_RULES = (
    ("classic", re.compile(r"\b(?:classic|classical|old|vintage|timeless)\b")),
    ("contemporary", re.compile(r"\b(?:contemporary|modern|recent|present-day|current)\b")),
    ("historical", re.compile(r"\b(?:historical|period|past|ancient|medieval|victorian|retro)\b")),
    ("future", re.compile(r"\b(?:future|futuristic)\b")),
)

# "1990s", "90s", "90's". The look-behind stops a match from starting in the
# middle of a longer number (e.g. "100s" must not be read as "00s").
_DECADE_RE = re.compile(r"(?<![0-9])([0-9]{4}|[0-9]{2})['’]?s\b")
_CURRENT_DECADE_RE = re.compile(r"\b(?:current|recent) decade\b|\b2020s\b")

# A run of name-like text: starts with a letter (any script), then letters,
# whitespace, dots, apostrophes and hyphens. Digits and other punctuation end it.
_NAME_RUN = r"[^\W\d_](?:[^\W\d_]|[\s.'’-])*"

# Trigger phrases that may introduce a person, in priority order. The name is
# captured inside a look-ahead so that a later trigger occurrence in the same
# query can still be examined when an earlier one yields no usable name.
_PERSON_PATTERNS = tuple(
    re.compile(r"\b(?:" + trigger + r")\s+(?=(" + _NAME_RUN + r"))")
    for trigger in (
        r"by",
        r"directed\s+by|director",
        r"written\s+by|author",
        r"starring|featuring|with",
        r"from|like",
    )
)

# Words that end a name: the captured run is cut at the first of these so that
# ordinary query wording ("from the 90s", "like a comedy") is not taken as a name.
_NAME_STOPWORDS = frozenset({
    "a", "an", "the", "this", "that", "these", "those", "some", "any", "one",
    "my", "your", "his", "her", "their", "our", "its",
    "i", "me", "you", "him", "them", "us", "it",
    "and", "or", "but", "for", "from", "with", "about", "in", "on", "at",
    "of", "to", "by", "as", "like", "set", "released", "who", "which",
    "something", "anything", "someone", "anyone", "please",
})
_POSSESSIVE_RE = re.compile(r"['’]s?\.*$")
_MAX_NAME_WORDS = 4


def _matching_labels(text: str, keyword_map: Dict[str, Any]) -> List[str]:
    """Return the labels of *keyword_map* with at least one keyword in *text*.

    Labels are returned in the mapping's iteration order, so the result is
    deterministic. Look-arounds are used instead of ``\\b`` so that keywords
    starting or ending with punctuation can still match as whole phrases.
    """
    return [
        label
        for label, keywords in keyword_map.items()
        if any(
            re.search(r"(?<!\w)" + re.escape(keyword) + r"(?!\w)", text)
            for keyword in keywords
        )
    ]


def _clean_person_name(raw: str) -> str:
    """Reduce a captured run of words to a capitalized name, or "" if none."""
    words: List[str] = []
    for word in raw.split():
        word = _POSSESSIVE_RE.sub("", word)  # "kurosawa's." -> "kurosawa"
        bare = word.strip(".-'’")
        # A single letter followed by a dot is an initial ("a. a. milne"),
        # even when the letter happens to be a stopword.
        is_initial = len(bare) == 1 and word.endswith(".")
        if not bare or (bare in _NAME_STOPWORDS and not is_initial):
            break
        words.append(word)
        if len(words) == _MAX_NAME_WORDS:
            break
    name = " ".join(words).rstrip(".-'’")  # drop sentence-final punctuation
    return " ".join(part.capitalize() for part in name.split())


def _extract_person_name(query_lower: str) -> str:
    """Return the first plausible person name in *query_lower*, or ""."""
    for pattern in _PERSON_PATTERNS:
        for match in pattern.finditer(query_lower):
            name = _clean_person_name(match.group(1))
            if name:
                return name
    return ""


def parse_user_query(query: str) -> Dict[str, Any]:
    """
    Parses a natural language user query to extract structured tags
    like genres, moods, target audience, era, decade, specific person, and media type preference.

    Matching is keyword/regex based and case-insensitive (the query is
    lower-cased first). Person detection is heuristic: it looks for a name
    after trigger words such as "by", "director", "starring", "with", "from"
    or "like", and cuts the name at the first stopword or after four words.

    Args:
        query (str): The user's input query string.

    Returns:
        Dict[str, Any]: A dictionary containing extracted tags.
        Example: {
            "genres": ["sci-fi", "thriller"],        # labels from GENRES_KEYWORDS, in mapping order
            "mood": ["suspenseful", "dark"],          # labels from MOOD_KEYWORDS, in mapping order
            "target_audience": "adult",               # or "children", "young_adult", None
            "era": "classic",                         # or "contemporary", "historical", "future", None
            "decade": "1990s",                        # "90s" is normalized to "1990s"; None if absent
            "specific_person": "Christopher Nolan",   # capitalized name, or None
            "media_type_preference": "book",          # or "movie", or None
            "raw_query": "<the unmodified input>"
        }
    """
    query_lower = query.lower()
    parsed_tags: Dict[str, Any] = {
        "genres": _matching_labels(query_lower, GENRES_KEYWORDS),
        "mood": _matching_labels(query_lower, MOOD_KEYWORDS),
        "target_audience": None,
        "era": None,
        "decade": None,
        "specific_person": _extract_person_name(query_lower) or None,
        "media_type_preference": None,
        "raw_query": query  # Keep original query for debugging/explanation
    }

    # --- Media Type Preference (strong indicator) ---
    if _MOVIE_RE.search(query_lower):
        parsed_tags["media_type_preference"] = "movie"
    # Deliberately not `elif`: when both kinds of wording appear, book wins,
    # which is the behavior callers have always seen.
    if _BOOK_RE.search(query_lower):
        parsed_tags["media_type_preference"] = "book"

    # --- Target Audience ---
    for audience, genre_label, pattern in _AUDIENCE_RULES:
        if pattern.search(query_lower):
            parsed_tags["target_audience"] = audience
            # The audience is reported in its own field, not also as a genre.
            if genre_label in parsed_tags["genres"]:
                parsed_tags["genres"].remove(genre_label)
            break

    # --- Era ---
    for era, pattern in _ERA_RULES:
        if pattern.search(query_lower):
            parsed_tags["era"] = era
            break

    # --- Decade ---
    decade_match = _DECADE_RE.search(query_lower)
    if decade_match:
        digits = decade_match.group(1)
        if len(digits) == 4:  # e.g., '1990s'
            parsed_tags["decade"] = f"{digits}s"
        elif digits.startswith("0"):  # e.g., '00s'
            parsed_tags["decade"] = "2000s"
        elif digits in ("10", "20"):  # '10s' / '20s' are read as this century
            parsed_tags["decade"] = f"20{digits}s"
        else:  # e.g., '90s'
            parsed_tags["decade"] = f"19{digits}s"
    # "current decade" / "recent decade" / an explicit "2020s" override the above.
    if _CURRENT_DECADE_RE.search(query_lower):
        parsed_tags["decade"] = "2020s"

    return parsed_tags
132
-
133
-
134
if __name__ == '__main__':
    # Demonstration run: parse each sample query and print the extracted tags.
    sample_queries = (
        "I want a heartwarming drama movie for young adults from the 90s.",
        "Recommend a thrilling sci-fi book by Isaac Asimov.",
        "A dark mystery by Agatha Christie.",
        "Show me action films for kids under 10.",
        "I need a romantic comedy released in the 2000s.",
        "Any classic historical fiction?",
        "looking for something uplifting for ages 18+",
        "What about a movie directed by Christopher Nolan?",
        "A book about adventure for children.",
        "A suspenseful thriller for adults.",
        "A historical drama set in the 1800s.",
        "A funny animation from the 80s.",
        "A contemporary romance novel.",
        "A classic sci-fi movie directed by Stanley Kubrick.",
        "I want a thriller by Stephen King.",
        "A Japanese film like Akira Kurosawa's.",
        "I'm feeling sad, recommend a melancholic movie.",
        "Give me an exciting thriller movie.",
        "I'm in the mood for something lighthearted.",
        "Looking for a really dark and grim book.",
        "Need something joyful to watch.",
        "I need an uplifting and inspiring film.",
        "Show me a truly gloomy and depressing story.",
        "Find me a film that's both chaotic and funny.",
        "I want something thought-provoking and deep.",
        "Looking for a movie that's really tense and nerve-wracking.",
        "Something wistful and nostalgic.",
        "I'm feeling angry, show me something intense and violent.",
        "Recommend a bizarre and absurd book.",
        "A beautiful and poignant love story.",
        "I need a really witty comedy.",
        "Something raw and gritty.",
        "A grand, sweeping epic.",
        "Something that brings tears to my eyes.",
        "Find me a slow-paced, meditative film.",
        "A mind-bending psychological thriller.",
    )

    for sample in sample_queries:
        result = parse_user_query(sample)
        # One call emits both lines; the trailing "\n" leaves a blank line
        # between consecutive entries.
        print(f"Query: '{sample}'", f"Parsed: {result}\n", sep="\n")
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import re
5
+ from typing import Dict, List, Any
6
+
7
+ # Import keywords from separate files
8
+ from utils.genres_data import GENRES_KEYWORDS
9
+ from utils.moods_data import MOOD_KEYWORDS
10
+
11
+
12
# Media-type wording. "stories" is listed explicitly because the plural of
# "story" is not formed by appending "s".
_MOVIE_RE = re.compile(r"\b(?:movie|film|picture|flick)s?\b")
_BOOK_RE = re.compile(r"\b(?:(?:book|novel|read|story)s?|stories)\b")

# Audience rules, checked in order: (tag value, genre label to drop, pattern).
# "young adult" must be tested before "adult" because the former contains the
# latter. Plural forms are accepted so "for adults" / "young adults" match.
_AUDIENCE_RULES = (
    (
        "children",
        "children",
        re.compile(r"\b(?:children|kids?|child(?:ren's)?|younger audiences?|juveniles?)\b"),
    ),
    (
        "young_adult",
        "young adult",
        re.compile(r"\b(?:young adults?|teens?|teenagers?|ya|adolescents?)\b"),
    ),
    (
        "adult",
        "adult",
        re.compile(r"\b(?:adults?|mature|grown-ups?|general audiences?)\b"),
    ),
)

# Era rules, checked in order; the first matching rule wins.
_ERA_RULES = (
    ("classic", re.compile(r"\b(?:classic|classical|old|vintage|timeless)\b")),
    ("contemporary", re.compile(r"\b(?:contemporary|modern|recent|present-day|current)\b")),
    ("historical", re.compile(r"\b(?:historical|period|past|ancient|medieval|victorian|retro)\b")),
    ("future", re.compile(r"\b(?:future|futuristic)\b")),
)

# "1990s", "90s", "90's". The look-behind stops a match from starting in the
# middle of a longer number (e.g. "100s" must not be read as "00s").
_DECADE_RE = re.compile(r"(?<![0-9])([0-9]{4}|[0-9]{2})['’]?s\b")
_CURRENT_DECADE_RE = re.compile(r"\b(?:current|recent) decade\b|\b2020s\b")

# A run of name-like text: starts with a letter (any script), then letters,
# whitespace, dots, apostrophes and hyphens. Digits and other punctuation end it.
_NAME_RUN = r"[^\W\d_](?:[^\W\d_]|[\s.'’-])*"

# Trigger phrases that may introduce a person, in priority order. The name is
# captured inside a look-ahead so that a later trigger occurrence in the same
# query can still be examined when an earlier one yields no usable name.
_PERSON_PATTERNS = tuple(
    re.compile(r"\b(?:" + trigger + r")\s+(?=(" + _NAME_RUN + r"))")
    for trigger in (
        r"by",
        r"directed\s+by|director",
        r"written\s+by|author",
        r"starring|featuring|with",
        r"from|like",
    )
)

# Words that end a name: the captured run is cut at the first of these so that
# ordinary query wording ("from the 90s", "like a comedy") is not taken as a name.
_NAME_STOPWORDS = frozenset({
    "a", "an", "the", "this", "that", "these", "those", "some", "any", "one",
    "my", "your", "his", "her", "their", "our", "its",
    "i", "me", "you", "him", "them", "us", "it",
    "and", "or", "but", "for", "from", "with", "about", "in", "on", "at",
    "of", "to", "by", "as", "like", "set", "released", "who", "which",
    "something", "anything", "someone", "anyone", "please",
})
_POSSESSIVE_RE = re.compile(r"['’]s?\.*$")
_MAX_NAME_WORDS = 4


def _matching_labels(text: str, keyword_map: Dict[str, Any]) -> List[str]:
    """Return the labels of *keyword_map* with at least one keyword in *text*.

    Labels are returned in the mapping's iteration order, so the result is
    deterministic. Look-arounds are used instead of ``\\b`` so that keywords
    starting or ending with punctuation can still match as whole phrases.
    """
    return [
        label
        for label, keywords in keyword_map.items()
        if any(
            re.search(r"(?<!\w)" + re.escape(keyword) + r"(?!\w)", text)
            for keyword in keywords
        )
    ]


def _clean_person_name(raw: str) -> str:
    """Reduce a captured run of words to a capitalized name, or "" if none."""
    words: List[str] = []
    for word in raw.split():
        word = _POSSESSIVE_RE.sub("", word)  # "kurosawa's." -> "kurosawa"
        bare = word.strip(".-'’")
        # A single letter followed by a dot is an initial ("a. a. milne"),
        # even when the letter happens to be a stopword.
        is_initial = len(bare) == 1 and word.endswith(".")
        if not bare or (bare in _NAME_STOPWORDS and not is_initial):
            break
        words.append(word)
        if len(words) == _MAX_NAME_WORDS:
            break
    name = " ".join(words).rstrip(".-'’")  # drop sentence-final punctuation
    return " ".join(part.capitalize() for part in name.split())


def _extract_person_name(query_lower: str) -> str:
    """Return the first plausible person name in *query_lower*, or ""."""
    for pattern in _PERSON_PATTERNS:
        for match in pattern.finditer(query_lower):
            name = _clean_person_name(match.group(1))
            if name:
                return name
    return ""


def parse_user_query(query: str) -> Dict[str, Any]:
    """
    Parses a natural language user query to extract structured tags
    like genres, moods, target audience, era, decade, specific person, and media type preference.

    Matching is keyword/regex based and case-insensitive (the query is
    lower-cased first). Person detection is heuristic: it looks for a name
    after trigger words such as "by", "director", "starring", "with", "from"
    or "like", and cuts the name at the first stopword or after four words.

    Args:
        query (str): The user's input query string.

    Returns:
        Dict[str, Any]: A dictionary containing extracted tags.
        Example: {
            "genres": ["sci-fi", "thriller"],        # labels from GENRES_KEYWORDS, in mapping order
            "mood": ["suspenseful", "dark"],          # labels from MOOD_KEYWORDS, in mapping order
            "target_audience": "adult",               # or "children", "young_adult", None
            "era": "classic",                         # or "contemporary", "historical", "future", None
            "decade": "1990s",                        # "90s" is normalized to "1990s"; None if absent
            "specific_person": "Christopher Nolan",   # capitalized name, or None
            "media_type_preference": "book",          # or "movie", or None
            "raw_query": "<the unmodified input>"
        }
    """
    query_lower = query.lower()
    parsed_tags: Dict[str, Any] = {
        "genres": _matching_labels(query_lower, GENRES_KEYWORDS),
        "mood": _matching_labels(query_lower, MOOD_KEYWORDS),
        "target_audience": None,
        "era": None,
        "decade": None,
        "specific_person": _extract_person_name(query_lower) or None,
        "media_type_preference": None,
        "raw_query": query  # Keep original query for debugging/explanation
    }

    # --- Media Type Preference (strong indicator) ---
    if _MOVIE_RE.search(query_lower):
        parsed_tags["media_type_preference"] = "movie"
    # Deliberately not `elif`: when both kinds of wording appear, book wins,
    # which is the behavior callers have always seen.
    if _BOOK_RE.search(query_lower):
        parsed_tags["media_type_preference"] = "book"

    # --- Target Audience ---
    for audience, genre_label, pattern in _AUDIENCE_RULES:
        if pattern.search(query_lower):
            parsed_tags["target_audience"] = audience
            # The audience is reported in its own field, not also as a genre.
            if genre_label in parsed_tags["genres"]:
                parsed_tags["genres"].remove(genre_label)
            break

    # --- Era ---
    for era, pattern in _ERA_RULES:
        if pattern.search(query_lower):
            parsed_tags["era"] = era
            break

    # --- Decade ---
    decade_match = _DECADE_RE.search(query_lower)
    if decade_match:
        digits = decade_match.group(1)
        if len(digits) == 4:  # e.g., '1990s'
            parsed_tags["decade"] = f"{digits}s"
        elif digits.startswith("0"):  # e.g., '00s'
            parsed_tags["decade"] = "2000s"
        elif digits in ("10", "20"):  # '10s' / '20s' are read as this century
            parsed_tags["decade"] = f"20{digits}s"
        else:  # e.g., '90s'
            parsed_tags["decade"] = f"19{digits}s"
    # "current decade" / "recent decade" / an explicit "2020s" override the above.
    if _CURRENT_DECADE_RE.search(query_lower):
        parsed_tags["decade"] = "2020s"

    return parsed_tags
132
+
133
+
134
if __name__ == '__main__':
    # Demonstration run: parse each sample query and print the extracted tags.
    sample_queries = (
        "I want a heartwarming drama movie for young adults from the 90s.",
        "Recommend a thrilling sci-fi book by Isaac Asimov.",
        "A dark mystery by Agatha Christie.",
        "Show me action films for kids under 10.",
        "I need a romantic comedy released in the 2000s.",
        "Any classic historical fiction?",
        "looking for something uplifting for ages 18+",
        "A book about adventure for children.",
        "A suspenseful thriller for adults.",
        "A historical drama set in the 1800s.",
        "A funny animation from the 80s.",
        "A contemporary romance novel.",
        "A classic sci-fi movie directed by Stanley Kubrick.",
        "I want a thriller by Stephen King.",
        "A Japanese film like Akira Kurosawa's.",
        "I'm feeling sad, recommend a melancholic movie.",
        "Give me an exciting thriller movie.",
        "I'm in the mood for something lighthearted.",
        "Looking for a really dark and grim book.",
        "Need something joyful to watch.",
        "I need an uplifting and inspiring film.",
        "Show me a truly gloomy and depressing story.",
        "Find me a film that's both chaotic and funny.",
        "I want something thought-provoking and deep.",
        "Looking for a movie that's really tense and nerve-wracking.",
        "Something wistful and nostalgic.",
        "I'm feeling angry, show me something intense and violent.",
        "Recommend a bizarre and absurd book.",
        "A beautiful and poignant love story.",
        "I need a really witty comedy.",
        "Something raw and gritty.",
        "A grand, sweeping epic.",
        "Something that brings tears to my eyes.",
        "Find me a slow-paced, meditative film.",
        "A mind-bending psychological thriller.",
    )

    for sample in sample_queries:
        result = parse_user_query(sample)
        # One call emits both lines; the trailing "\n" leaves a blank line
        # between consecutive entries.
        print(f"Query: '{sample}'", f"Parsed: {result}\n", sep="\n")