Spaces:
Sleeping
Sleeping
File size: 7,683 Bytes
9280c07 ecf1e4f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | import re
from typing import Dict, List, Any
# Import keywords from separate files
from utils.genres_data import GENRES_KEYWORDS
from utils.moods_data import MOOD_KEYWORDS
# Words that may follow "by" / "from" / "like" / "with" in ordinary phrasing
# ("from the 90s", "with a twist") but never begin a person's name.
_NON_NAME_LEAD_WORDS = frozenset({
    "the", "a", "an", "this", "that", "these", "those",
    "my", "your", "his", "her", "its", "our", "their",
    "some", "any",
})

# Tried in order; the first pattern that yields a plausible name wins.
_PERSON_PATTERNS = (
    r'\bby\s+([a-zA-Z\s\.]+)\b',
    r'\b(?:directed\s+by|director)\s+([a-zA-Z\s\.]+)\b',
    r'\b(?:written\s+by|author)\s+([a-zA-Z\s\.]+)\b',
    r'\b(?:starring|featuring|with)\s+([a-zA-Z\s\.]+)\b',
    r'\b(?:from|like)\s+([a-zA-Z\s\.]+)s?\b',
)

# (target_audience value, genre name that duplicates it, pattern).
# Order matters: the first matching rule wins, so "young adults" must be
# tested before the plain "adults" rule.
_AUDIENCE_RULES = (
    ("children", "children",
     r'\b(?:child(?:ren)?|kids?|younger audiences?|juveniles?)\b'),
    ("young_adult", "young adult",
     r'\b(?:young adults?|teens?|teenagers?|ya|adolescents?)\b'),
    ("adult", "adult",
     r'\b(?:adults?|mature|grown-ups?|general audiences?)\b'),
)

# (era value, pattern). First match wins, e.g. "classic historical" -> "classic".
_ERA_RULES = (
    ("classic", r'\b(classic|classical|old|vintage|timeless)\b'),
    ("contemporary", r'\b(contemporary|modern|recent|present-day|current)\b'),
    ("historical", r'\b(historical|period|past|ancient|medieval|victorian|retro)\b'),
    ("future", r'\b(future|futuristic)\b'),
)


def _keyword_hits(keyword_map: Dict[str, List[str]], text: str) -> List[str]:
    """Return the keys of keyword_map having a keyword that occurs as a whole word in text.

    Keys are returned in the map's own iteration order, so the result is
    deterministic and free of duplicates.
    """
    return [
        tag
        for tag, keywords in keyword_map.items()
        if any(re.search(r'\b' + re.escape(k) + r'\b', text) for k in keywords)
    ]


def _extract_decade(text: str):
    """Return a decade tag such as "1990s" found in lower-cased text, or None."""
    decade = None
    # The look-behind stops a match from starting in the middle of a longer
    # digit run (e.g. "199s" must not be read as "99s").
    match = re.search(r'(?<!\d)(\d{4}|\d{2})s\b', text)
    if match:
        digits = match.group(1)
        if len(digits) == 4:          # already a full year, e.g. "1990s"
            decade = f"{digits}s"
        elif digits.startswith('0'):  # "00s" and friends -> 2000s
            decade = "2000s"
        elif digits in ("10", "20"):  # "10s" / "20s" -> this century
            decade = f"20{digits}s"
        else:                         # "90s" -> "1990s"
            decade = f"19{digits}s"
    # An explicit reference to the current decade overrides anything above.
    if re.search(r'\b(current|recent) decade\b', text) or re.search(r'\b2020s\b', text):
        decade = "2020s"
    return decade


def _extract_person(text: str):
    """Return a capitalised person name found in lower-cased text, or None.

    Captures that are empty or that begin with an article, demonstrative or
    possessive ("from the 90s", "with a twist") are skipped instead of being
    reported as a name.
    """
    for pattern in _PERSON_PATTERNS:
        for match in re.finditer(pattern, text):
            words = match.group(1).split()
            if not words or words[0] in _NON_NAME_LEAD_WORDS:
                continue
            return ' '.join(word.capitalize() for word in words)
    return None


def parse_user_query(query: str) -> Dict[str, Any]:
    """
    Parse a natural-language query into structured recommendation tags.

    Matching is case-insensitive (the query is lower-cased first) and purely
    keyword/regex based.

    Args:
        query (str): The user's input query string.

    Returns:
        Dict[str, Any]: A dictionary with these keys:
            "genres": list of GENRES_KEYWORDS keys whose keywords occur in
                the query, in GENRES_KEYWORDS order. A genre that merely
                repeats the detected audience ("children", "young adult",
                "adult") is removed.
            "mood": list of MOOD_KEYWORDS keys whose keywords occur in the
                query, in MOOD_KEYWORDS order.
            "target_audience": "children", "young_adult", "adult" or None.
            "era": "classic", "contemporary", "historical", "future" or None.
            "decade": e.g. "1990s" (a two-digit "90s" is expanded), or None.
            "specific_person": capitalised name such as "Christopher Nolan",
                or None.
            "media_type_preference": "movie", "book" or None. When both
                movie and book words are present, "book" wins.
            "raw_query": the original, unmodified query string.
    """
    query_lower = query.lower()
    parsed_tags: Dict[str, Any] = {
        "genres": [],
        "mood": [],
        "target_audience": None,
        "era": None,
        "decade": None,
        "specific_person": None,
        "media_type_preference": None,
        "raw_query": query,  # Keep original query for debugging/explanation
    }

    # --- Media Type Preference (strong indicator) ---
    # The book check runs second on purpose: it overrides "movie" when both match.
    if re.search(r'\b(movie|film|picture|flick)s?\b', query_lower):
        parsed_tags["media_type_preference"] = "movie"
    if re.search(r'\b(book|novel|read|story)s?\b', query_lower):
        parsed_tags["media_type_preference"] = "book"

    # --- Genres and Moods / Tone ---
    parsed_tags["genres"] = _keyword_hits(GENRES_KEYWORDS, query_lower)
    parsed_tags["mood"] = _keyword_hits(MOOD_KEYWORDS, query_lower)

    # --- Target Audience ---
    for audience, duplicate_genre, pattern in _AUDIENCE_RULES:
        if re.search(pattern, query_lower):
            parsed_tags["target_audience"] = audience
            # The audience is reported separately, so drop its genre twin.
            parsed_tags["genres"] = [
                genre for genre in parsed_tags["genres"] if genre != duplicate_genre
            ]
            break

    # --- Era ---
    for era, pattern in _ERA_RULES:
        if re.search(pattern, query_lower):
            parsed_tags["era"] = era
            break

    # --- Decade and Specific Person (Author/Director/Actor) ---
    parsed_tags["decade"] = _extract_decade(query_lower)
    parsed_tags["specific_person"] = _extract_person(query_lower)

    return parsed_tags
if __name__ == '__main__':
    # Demonstration run: parse a fixed set of sample queries and print,
    # for each one, the query followed by the tags extracted from it.
    sample_queries = [
        "I want a heartwarming drama movie for young adults from the 90s.",
        "Recommend a thrilling sci-fi book by Isaac Asimov.",
        "A dark mystery by Agatha Christie.",
        "Show me action films for kids under 10.",
        "I need a romantic comedy released in the 2000s.",
        "Any classic historical fiction?",
        "looking for something uplifting for ages 18+",
        "A book about adventure for children.",
        "A suspenseful thriller for adults.",
        "A historical drama set in the 1800s.",
        "A funny animation from the 80s.",
        "A contemporary romance novel.",
        "A classic sci-fi movie directed by Stanley Kubrick.",
        "I want a thriller by Stephen King.",
        "A Japanese film like Akira Kurosawa's.",
        "I'm feeling sad, recommend a melancholic movie.",
        "Give me an exciting thriller movie.",
        "I'm in the mood for something lighthearted.",
        "Looking for a really dark and grim book.",
        "Need something joyful to watch.",
        "I need an uplifting and inspiring film.",
        "Show me a truly gloomy and depressing story.",
        "Find me a film that's both chaotic and funny.",
        "I want something thought-provoking and deep.",
        "Looking for a movie that's really tense and nerve-wracking.",
        "Something wistful and nostalgic.",
        "I'm feeling angry, show me something intense and violent.",
        "Recommend a bizarre and absurd book.",
        "A beautiful and poignant love story.",
        "I need a really witty comedy.",
        "Something raw and gritty.",
        "A grand, sweeping epic.",
        "Something that brings tears to my eyes.",
        "Find me a slow-paced, meditative film.",
        "A mind-bending psychological thriller.",
    ]
    for text in sample_queries:
        tags = parse_user_query(text)
        print(f"Query: '{text}'")
        print(f"Parsed: {tags}\n")