Spaces:
Sleeping
Sleeping
import json
import random
import re
import time
from datetime import datetime

import boto3
import chromadb
import pandas as pd
import streamlit as st
# Sample Bollywood movies data (simplified for demo)
# Each record has the keys: title, year, genre, director, plot.
# Every film is listed exactly once; the 1979 Hrishikesh Mukherjee comedy
# appears only under the spelling "Gol Maal", so retrieval cannot return the
# same film twice.
SAMPLE_MOVIES = [
    {"title": "Sholay", "year": 1975, "genre": "Action", "director": "Ramesh Sippy",
     "plot": "Two criminals are hired by a retired police officer to capture a bandit terrorizing a village."},
    {"title": "Dilwale Dulhania Le Jayenge", "year": 1995, "genre": "Romance", "director": "Aditya Chopra",
     "plot": "A young man and woman fall in love during a trip to Europe, but face family opposition."},
    {"title": "Lagaan", "year": 2001, "genre": "Drama", "director": "Ashutosh Gowariker",
     "plot": "Villagers accept a challenge from British officers to play cricket to avoid paying tax."},
    {"title": "3 Idiots", "year": 2009, "genre": "Comedy", "director": "Rajkumar Hirani",
     "plot": "Two friends search for their missing college friend and recall their engineering days."},
    {"title": "Dangal", "year": 2016, "genre": "Sports", "director": "Nitesh Tiwari",
     "plot": "A former wrestler trains his daughters to become world-class wrestlers."},
    {"title": "Anand", "year": 1971, "genre": "Drama", "director": "Hrishikesh Mukherjee",
     "plot": "A terminally ill man spreads joy and teaches the meaning of life to a doctor."},
    {"title": "Chupke Chupke", "year": 1975, "genre": "Comedy", "director": "Hrishikesh Mukherjee",
     "plot": "A newlywed plays pranks on his wife's family by pretending to be someone else."},
    {"title": "Don", "year": 1978, "genre": "Action", "director": "Chandra Barot",
     "plot": "A police officer impersonates a crime boss to infiltrate his gang."},
    {"title": "Andaz Apna Apna", "year": 1994, "genre": "Comedy", "director": "Rajkumar Santoshi",
     "plot": "Two friends compete to marry a wealthy heiress but get caught up in a kidnapping plot."},
    {"title": "Mughal-E-Azam", "year": 1960, "genre": "Romance", "director": "K. Asif",
     "plot": "A Mughal prince falls in love with a court dancer, defying his father the emperor."},
    {"title": "Deewaar", "year": 1975, "genre": "Action", "director": "Yash Chopra",
     "plot": "Two brothers choose different paths in life - one becomes a police officer, the other a criminal."},
    {"title": "Queen", "year": 2013, "genre": "Comedy", "director": "Vikas Bahl",
     "plot": "A woman goes on her honeymoon alone after her wedding is called off."},
    {"title": "Zindagi Na Milegi Dobara", "year": 2011, "genre": "Adventure", "director": "Zoya Akhtar",
     "plot": "Three friends go on a bachelor trip to Spain and face their fears."},
    {"title": "Taare Zameen Par", "year": 2007, "genre": "Drama", "director": "Aamir Khan",
     "plot": "An art teacher helps a dyslexic child overcome his learning difficulties."},
    {"title": "Rang De Basanti", "year": 2006, "genre": "Drama", "director": "Rakeysh Omprakash Mehra",
     "plot": "College students making a documentary about freedom fighters become revolutionaries themselves."},
    {"title": "Gol Maal", "year": 1979, "genre": "Comedy", "director": "Hrishikesh Mukherjee",
     "plot": "A young man lies about having a mustache to keep his job with a strict boss."},
    {"title": "Namak Haraam", "year": 1973, "genre": "Drama", "director": "Hrishikesh Mukherjee",
     "plot": "A friendship is tested when one friend betrays the other for money and power."},
    {"title": "Kuch Kuch Hota Hai", "year": 1998, "genre": "Romance", "director": "Karan Johar",
     "plot": "A man's daughter tries to reunite him with his college sweetheart."},
    {"title": "My Name is Khan", "year": 2010, "genre": "Drama", "director": "Karan Johar",
     "plot": "A man with Asperger's syndrome embarks on a journey to meet the President of the United States."},
]
# Simple function to connect to AWS Bedrock
def connect_to_bedrock():
    """Create a boto3 ``bedrock-runtime`` client for region ``us-east-1``.

    Returns:
        The boto3 client, or ``None`` if creating it raised. In the failure
        case an error banner is shown via ``st.error``; the other functions
        in this file treat a falsy client as "use mock responses".
    """
    try:
        return boto3.client('bedrock-runtime', region_name='us-east-1')
    except Exception:
        # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as a
        # configuration problem.
        # NOTE(review): a successfully created client may still fail at call
        # time if credentials are missing - confirm against boto3 behaviour.
        st.error("β οΈ AWS Bedrock not configured. Using mock responses for demo.")
        return None
# Get embeddings from Bedrock
# Length of the placeholder vectors used when Bedrock is unavailable.
# NOTE(review): presumably chosen to match the Titan model's output size - confirm.
_DUMMY_EMBEDDING_DIM = 1536


def _dummy_embedding(dim=_DUMMY_EMBEDDING_DIM):
    """Return a placeholder vector of ``dim`` random floats in [0, 1)."""
    return [random.random() for _ in range(dim)]


def get_embeddings(bedrock_client, text):
    """Return an embedding vector for ``text``.

    Args:
        bedrock_client: Object exposing ``invoke_model(modelId=..., body=...)``
            whose response has a readable ``'body'`` holding JSON with an
            ``'embedding'`` key. A falsy value selects demo mode.
        text: The text to embed.

    Returns:
        The ``'embedding'`` list from the model response, or a random
        placeholder vector when no client is given or the call fails.
    """
    if not bedrock_client:
        # Demo mode: no client available.
        return _dummy_embedding()
    try:
        response = bedrock_client.invoke_model(
            modelId="amazon.titan-embed-text-v1",
            body=json.dumps({"inputText": text}),
        )
        return json.loads(response['body'].read())['embedding']
    except Exception:
        # Best-effort fallback if the API call or response parsing fails.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return _dummy_embedding()
# Create movie documents and store in ChromaDB
def setup_movie_database(bedrock_client):
    """Build the ``bollywood_movies`` ChromaDB collection from SAMPLE_MOVIES.

    For every movie a text document is composed, embedded via
    ``get_embeddings`` and stored together with lower-cased genre/director
    metadata and a derived ``decade`` string (e.g. ``"1970s"``). Progress and
    status are reported through Streamlit widgets.

    Args:
        bedrock_client: Passed through to ``get_embeddings``; may be ``None``.

    Returns:
        The freshly created ChromaDB collection.
    """
    st.write("π¬ Setting up Bollywood movies database...")
    # Create ChromaDB client
    chroma_client = chromadb.Client()
    # Create or recreate collection
    try:
        chroma_client.delete_collection("bollywood_movies")
    except Exception:
        # Deliberate best effort: presumably this fails when the collection
        # does not exist yet, which is fine because it is created just below.
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit propagating.
        pass
    collection = chroma_client.create_collection("bollywood_movies")
    # Prepare data for ChromaDB
    ids = []
    documents = []
    metadatas = []
    embeddings = []
    total = len(SAMPLE_MOVIES)
    progress_bar = st.progress(0)
    for i, movie in enumerate(SAMPLE_MOVIES):
        # Create document text
        doc_text = f"Title: {movie['title']}\nYear: {movie['year']}\nGenre: {movie['genre']}\nDirector: {movie['director']}\nPlot: {movie['plot']}"
        # The list position doubles as the document id.
        ids.append(str(i))
        documents.append(doc_text)
        metadatas.append({
            'title': movie['title'],
            'year': movie['year'],
            # Lower-cased so they compare equal to the lower-case values
            # produced by detect_filters.
            'genre': movie['genre'].lower(),
            'director': movie['director'].lower(),
            'decade': f"{(movie['year'] // 10) * 10}s",
        })
        embeddings.append(get_embeddings(bedrock_client, doc_text))
        progress_bar.progress((i + 1) / total)
    # Add to ChromaDB (skipped when there is nothing to add)
    if ids:
        collection.add(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings,
        )
    st.success(f"β Added {len(SAMPLE_MOVIES)} movies to database!")
    return collection
# Simple query filter detection
# Canonical genre keywords, checked in this order (first hit wins).
_GENRE_KEYWORDS = ('action', 'comedy', 'drama', 'romance', 'sports', 'adventure')
# Word forms that do not contain the canonical keyword as a substring.
# Consulted only when no canonical keyword matched, so they never override it.
_GENRE_ALIASES = (('comedies', 'comedy'), ('romantic', 'romance'))
_DECADE_KEYWORDS = ('1960s', '1970s', '1980s', '1990s', '2000s', '2010s')
_DIRECTOR_KEYWORDS = ('hrishikesh mukherjee', 'rajkumar hirani', 'aamir khan', 'yash chopra')
# A standalone four-digit year 19xx/20xx ("1970s" does not match: no word
# boundary between the digits and the trailing "s").
_YEAR_PATTERN = re.compile(r'\b(19\d{2}|20\d{2})\b')


def _first_keyword(keywords, text):
    """Return the first keyword that occurs as a substring of ``text``, else None."""
    return next((kw for kw in keywords if kw in text), None)


def detect_filters(query):
    """Derive metadata filters from a free-text query.

    Matching is case-insensitive substring matching against small fixed
    keyword lists.

    Args:
        query: The user's question.

    Returns:
        A dict with any of the keys ``'genre'``, ``'decade'`` and
        ``'director'`` (lower-case values); empty when nothing is detected.
        A standalone year such as ``1975`` is mapped to its decade
        (``'1970s'``) and takes precedence over an explicit decade keyword.
    """
    query_lower = query.lower()
    filters = {}

    # Genre detection: canonical keywords first, then alternative word forms
    # such as "romantic", which does not contain "romance".
    genre = _first_keyword(_GENRE_KEYWORDS, query_lower)
    if genre is None:
        genre = next(
            (canonical for alias, canonical in _GENRE_ALIASES if alias in query_lower),
            None,
        )
    if genre is not None:
        filters['genre'] = genre

    # Decade detection
    decade = _first_keyword(_DECADE_KEYWORDS, query_lower)
    if decade is not None:
        filters['decade'] = decade

    # Year detection: the first standalone year overrides the decade keyword.
    year_match = _YEAR_PATTERN.search(query)
    if year_match:
        year = int(year_match.group(1))
        filters['decade'] = f"{(year // 10) * 10}s"

    # Director detection (simple)
    director = _first_keyword(_DIRECTOR_KEYWORDS, query_lower)
    if director is not None:
        filters['director'] = director

    return filters
# Retrieve without metadata filter
def retrieve_without_filter(collection, bedrock_client, query, top_k=5):
    """Run an unfiltered similarity search for ``query``.

    Args:
        collection: Object whose ``query(query_embeddings=..., n_results=...)``
            returns a dict with ``'documents'``, ``'metadatas'`` and
            ``'distances'`` lists (one inner list per query embedding).
        bedrock_client: Passed through to ``get_embeddings``.
        query: The user's question.
        top_k: Maximum number of results requested.

    Returns:
        ``(movies, seconds)`` where ``movies`` is a list of dicts with the
        keys ``'document'``, ``'metadata'`` and ``'distance'``, and
        ``seconds`` is the wall-clock time spent on embedding plus search.
    """
    started = time.time()
    # The timer covers the embedding call as well as the search itself.
    vector = get_embeddings(bedrock_client, query)
    hits = collection.query(query_embeddings=[vector], n_results=top_k)
    finished = time.time()

    # Only one query embedding was sent, so only index 0 of each list is used.
    movies = [
        {
            'document': doc,
            'metadata': hits['metadatas'][0][pos],
            'distance': hits['distances'][0][pos],
        }
        for pos, doc in enumerate(hits['documents'][0])
    ]
    return movies, finished - started
# Retrieve with metadata filter
def _build_where_clause(filters):
    """Translate ``{field: value}`` filters into a Chroma ``where`` clause.

    Returns ``None`` for no filters, the single ``{field: value}`` condition
    for one filter, and ``{"$and": [...]}`` for several, because Chroma
    expects exactly one top-level condition/operator in ``where``.
    """
    conditions = [{field: value} for field, value in filters.items()]
    if not conditions:
        return None
    if len(conditions) == 1:
        return conditions[0]
    return {"$and": conditions}


def retrieve_with_filter(collection, bedrock_client, query, filters, top_k=5):
    """Run a similarity search for ``query`` restricted by metadata filters.

    Args:
        collection: Object whose ``query(...)`` returns a dict with
            ``'documents'``, ``'metadatas'`` and ``'distances'`` lists.
        bedrock_client: Passed through to ``get_embeddings``.
        query: The user's question.
        filters: Mapping of metadata field to required value, e.g.
            ``{'genre': 'comedy', 'decade': '1970s'}``. All conditions must
            hold. An empty mapping performs an unfiltered search.
        top_k: Maximum number of results requested.

    Returns:
        ``(movies, seconds)`` where ``movies`` is a list of dicts with the
        keys ``'document'``, ``'metadata'`` and ``'distance'``, and
        ``seconds`` is the wall-clock time spent on embedding plus search
        (including the fallback search, if one was needed).
    """
    start_time = time.time()
    # Get query embedding (included in the measured time)
    query_embedding = get_embeddings(bedrock_client, query)

    query_kwargs = {'query_embeddings': [query_embedding], 'n_results': top_k}
    where_clause = _build_where_clause(filters)
    if where_clause is not None:
        query_kwargs['where'] = where_clause

    # Search with filters
    try:
        results = collection.query(**query_kwargs)
    except Exception:
        # If filtering fails, fall back to no filter (deliberate best effort).
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt /
        # SystemExit are not swallowed.
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
        )
    end_time = time.time()

    # Format results; one query embedding was sent, so index 0 is used.
    movies = [
        {
            'document': document,
            'metadata': results['metadatas'][0][i],
            'distance': results['distances'][0][i],
        }
        for i, document in enumerate(results['documents'][0])
    ]
    return movies, end_time - start_time
# Generate answer using Bedrock
def generate_answer(bedrock_client, query, movies):
    """Ask the Bedrock chat model to answer ``query`` from retrieved movies.

    Args:
        bedrock_client: Object exposing ``invoke_model(modelId=..., body=...)``
            whose response has a readable ``'body'`` holding JSON shaped like
            ``{"content": [{"text": ...}]}``. A falsy value selects demo mode.
        query: The user's question.
        movies: Retrieved hits; each must have a ``'document'`` string.

    Returns:
        The model's answer text, or a canned message when no client is
        available or the call / response parsing fails.
    """
    if not bedrock_client:
        return "π¬ Based on the retrieved movies, here are some recommendations that match your query!"

    # Create context from movies
    context = "\n\n".join(movie['document'] for movie in movies)
    prompt = (
        "\n"
        "Based on the following Bollywood movies information, please answer the user's question.\n"
        f"Question: {query}\n"
        "Movies Information:\n"
        f"{context}\n"
        "Please provide a helpful and informative answer about the movies.\n"
    )
    try:
        body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 400,
            "messages": [{"role": "user", "content": prompt}],
        })
        response = bedrock_client.invoke_model(
            modelId="anthropic.claude-3-haiku-20240307-v1:0",
            body=body,
        )
        result = json.loads(response['body'].read())
        return result['content'][0]['text']
    except Exception:
        # Best-effort fallback for API or parsing failures. ``Exception``
        # rather than a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate.
        return "π¬ Based on the retrieved movies, here are some great recommendations that match your query!"
# Main app
def _render_movie_expanders(movies):
    """Show one expander per retrieved movie with genre, director and distance."""
    for rank, hit in enumerate(movies, 1):
        with st.expander(f"{rank}. {hit['metadata']['title']} ({hit['metadata']['year']})"):
            st.write(f"**Genre:** {hit['metadata']['genre'].title()}")
            st.write(f"**Director:** {hit['metadata']['director'].title()}")
            st.write(f"**Distance:** {hit['distance']:.4f}")


def main():
    """Render the Streamlit UI: database setup, query selection and search.

    Before setup, only a "load data" button is shown. Once the collection is
    stored in ``st.session_state``, the user picks or types a query; the app
    runs an unfiltered search and, if ``detect_filters`` finds any filters, a
    filtered one, then shows timings and a generated answer.

    NOTE(review): the block nesting below was reconstructed from a source
    whose indentation was lost (in particular, that the "show all movies"
    checkbox and the reset button live in the "database ready" branch) -
    confirm against the original file. Several UI strings also look
    mis-encoded (probably emoji); they are reproduced as found.
    """
    st.title("π¬ Bollywood Movies RAG with Metadata Filtering")
    st.write("Ask questions about Bollywood movies and see how metadata filtering speeds up retrieval!")

    # Initialize session state
    for state_key, initial in (('collection', None), ('setup_done', False)):
        if state_key not in st.session_state:
            st.session_state[state_key] = initial

    if not st.session_state.setup_done:
        # Setup section
        st.subheader("π οΈ Setup Movie Database")
        if st.button("π Load Bollywood Movies Data"):
            try:
                client = connect_to_bedrock()
                st.session_state.collection = setup_movie_database(client)
                st.session_state.bedrock_client = client
                st.session_state.setup_done = True
                st.balloons()
            except Exception as exc:
                st.error(f"β Setup failed: {str(exc)}")
    else:
        st.success("β Movie database is ready!")

        # Sample queries
        st.subheader("π Try These Sample Queries")
        sample_queries = [
            "What are some good action movies?",
            "Tell me a few comedy movies from the 1970s",
            "What is the movie Sholay about?",
            "Tell me a few movies directed by Hrishikesh Mukherjee",
            "What are some romantic movies from the 1990s?"
        ]
        choice = st.radio("Choose a query:", ["Custom Query"] + sample_queries)
        if choice != "Custom Query":
            query = choice
            st.write(f"Selected: **{query}**")
        else:
            query = st.text_input("Enter your question about Bollywood movies:")

        if query and st.button("π Search Movies"):
            try:
                client = st.session_state.bedrock_client
                movie_collection = st.session_state.collection
                # Detect filters
                filters = detect_filters(query)
                st.write("---")

                # Method 1: Without metadata filter
                st.subheader("π Method 1: Without Metadata Filter")
                plain_hits, plain_secs = retrieve_without_filter(movie_collection, client, query)
                st.write(f"β±οΈ **Time taken: {plain_secs:.4f} seconds**")
                st.write("**Retrieved Movies:**")
                _render_movie_expanders(plain_hits)

                # Method 2: With metadata filter
                st.subheader("π― Method 2: With Metadata Filter")
                if not filters:
                    st.write("**No specific filters detected** - using general retrieval")
                    st.write(f"β±οΈ **Time taken: {plain_secs:.4f} seconds**")
                    # Generate answer with no filter results
                    st.subheader("π€ AI Generated Answer")
                    st.success(generate_answer(client, query, plain_hits))
                else:
                    st.write(f"**Detected Filters:** {filters}")
                    filtered_hits, filtered_secs = retrieve_with_filter(movie_collection, client, query, filters)
                    st.write(f"β±οΈ **Time taken: {filtered_secs:.4f} seconds**")
                    st.write("**Filtered Retrieved Movies:**")
                    _render_movie_expanders(filtered_hits)

                    # Performance comparison
                    st.subheader("β‘ Performance Comparison")
                    left, middle, right = st.columns(3)
                    with left:
                        st.metric("Without Filter", f"{plain_secs:.4f}s")
                    with middle:
                        st.metric("With Filter", f"{filtered_secs:.4f}s")
                    with right:
                        # Relative time saved by the filtered search, in percent.
                        if plain_secs > 0:
                            speedup = ((plain_secs - filtered_secs) / plain_secs) * 100
                        else:
                            speedup = 0
                        st.metric("Speedup", f"{speedup:.1f}%")

                    # Generate final answer
                    st.subheader("π€ AI Generated Answer")
                    st.success(generate_answer(client, query, filtered_hits))
            except Exception as exc:
                st.error(f"β Search failed: {str(exc)}")

        # Show movie database
        if st.checkbox("π Show All Movies in Database"):
            st.subheader("Movie Database")
            st.dataframe(pd.DataFrame(SAMPLE_MOVIES))

        # Reset button
        if st.button("π Reset Database"):
            st.session_state.collection = None
            st.session_state.setup_done = False
            st.rerun()
# Installation and deployment guide
def show_guides():
    """Render two side-by-side expanders: local installation and deployment."""
    # Each guide starts and ends with a newline, hence the empty first and
    # last list items.
    install_guide = "\n".join([
        "",
        "**Step 1: Install Libraries**",
        "```bash",
        "pip install streamlit boto3 chromadb pandas",
        "```",
        "**Step 2: Setup AWS**",
        "```bash",
        "aws configure",
        "```",
        "**Step 3: Run Locally**",
        "```bash",
        "streamlit run bollywood_rag.py",
        "```",
        "",
    ])
    deploy_guide = "\n".join([
        "",
        "**Step 1: Create files**",
        "- `app.py` (this code)",
        "- `requirements.txt`",
        "- `README.md`",
        "**Step 2: requirements.txt**",
        "```",
        "streamlit",
        "boto3",
        "chromadb",
        "pandas",
        "```",
        "**Step 3: Deploy**",
        "1. Push to GitHub",
        "2. Connect to Hugging Face Spaces",
        "3. Select Streamlit SDK",
        "4. Add AWS secrets in settings",
        "",
    ])

    left_column, right_column = st.columns(2)
    with left_column, st.expander("π Installation Guide"):
        st.markdown(install_guide)
    with right_column, st.expander("π Deploy to Hugging Face"):
        st.markdown(deploy_guide)
###
# Run the app
# Script entry point: the guide expanders are rendered first, then the main
# app UI (title, setup and search sections).
if __name__ == "__main__":
    show_guides()
    main()