# DarshanaD's picture
# Initial commit
# ebf59ff
import streamlit as st
import boto3
import json
import chromadb
import pandas as pd
import time
import re
from datetime import datetime
# Demo corpus of Bollywood movies (simplified).
# Every record carries the same five keys: title, year, genre, director, plot.
# NOTE(review): "Golmaal" and "Gol Maal" share year, genre and director and
# look like the same film listed twice with different plot text - confirm
# whether both entries are intended.
SAMPLE_MOVIES = [
    {
        "title": "Sholay", "year": 1975, "genre": "Action", "director": "Ramesh Sippy",
        "plot": "Two criminals are hired by a retired police officer to capture a bandit terrorizing a village.",
    },
    {
        "title": "Dilwale Dulhania Le Jayenge", "year": 1995, "genre": "Romance", "director": "Aditya Chopra",
        "plot": "A young man and woman fall in love during a trip to Europe, but face family opposition.",
    },
    {
        "title": "Lagaan", "year": 2001, "genre": "Drama", "director": "Ashutosh Gowariker",
        "plot": "Villagers accept a challenge from British officers to play cricket to avoid paying tax.",
    },
    {
        "title": "3 Idiots", "year": 2009, "genre": "Comedy", "director": "Rajkumar Hirani",
        "plot": "Two friends search for their missing college friend and recall their engineering days.",
    },
    {
        "title": "Dangal", "year": 2016, "genre": "Sports", "director": "Nitesh Tiwari",
        "plot": "A former wrestler trains his daughters to become world-class wrestlers.",
    },
    {
        "title": "Anand", "year": 1971, "genre": "Drama", "director": "Hrishikesh Mukherjee",
        "plot": "A terminally ill man spreads joy and teaches the meaning of life to a doctor.",
    },
    {
        "title": "Golmaal", "year": 1979, "genre": "Comedy", "director": "Hrishikesh Mukherjee",
        "plot": "A man creates chaos by lying about his identity to get a job.",
    },
    {
        "title": "Chupke Chupke", "year": 1975, "genre": "Comedy", "director": "Hrishikesh Mukherjee",
        "plot": "A newlywed plays pranks on his wife's family by pretending to be someone else.",
    },
    {
        "title": "Don", "year": 1978, "genre": "Action", "director": "Chandra Barot",
        "plot": "A police officer impersonates a crime boss to infiltrate his gang.",
    },
    {
        "title": "Andaz Apna Apna", "year": 1994, "genre": "Comedy", "director": "Rajkumar Santoshi",
        "plot": "Two friends compete to marry a wealthy heiress but get caught up in a kidnapping plot.",
    },
    {
        "title": "Mughal-E-Azam", "year": 1960, "genre": "Romance", "director": "K. Asif",
        "plot": "A Mughal prince falls in love with a court dancer, defying his father the emperor.",
    },
    {
        "title": "Deewaar", "year": 1975, "genre": "Action", "director": "Yash Chopra",
        "plot": "Two brothers choose different paths in life - one becomes a police officer, the other a criminal.",
    },
    {
        "title": "Queen", "year": 2013, "genre": "Comedy", "director": "Vikas Bahl",
        "plot": "A woman goes on her honeymoon alone after her wedding is called off.",
    },
    {
        "title": "Zindagi Na Milegi Dobara", "year": 2011, "genre": "Adventure", "director": "Zoya Akhtar",
        "plot": "Three friends go on a bachelor trip to Spain and face their fears.",
    },
    {
        "title": "Taare Zameen Par", "year": 2007, "genre": "Drama", "director": "Aamir Khan",
        "plot": "An art teacher helps a dyslexic child overcome his learning difficulties.",
    },
    {
        "title": "Rang De Basanti", "year": 2006, "genre": "Drama", "director": "Rakeysh Omprakash Mehra",
        "plot": "College students making a documentary about freedom fighters become revolutionaries themselves.",
    },
    {
        "title": "Gol Maal", "year": 1979, "genre": "Comedy", "director": "Hrishikesh Mukherjee",
        "plot": "A young man lies about having a mustache to keep his job with a strict boss.",
    },
    {
        "title": "Namak Haraam", "year": 1973, "genre": "Drama", "director": "Hrishikesh Mukherjee",
        "plot": "A friendship is tested when one friend betrays the other for money and power.",
    },
    {
        "title": "Kuch Kuch Hota Hai", "year": 1998, "genre": "Romance", "director": "Karan Johar",
        "plot": "A man's daughter tries to reunite him with his college sweetheart.",
    },
    {
        "title": "My Name is Khan", "year": 2010, "genre": "Drama", "director": "Karan Johar",
        "plot": "A man with Asperger's syndrome embarks on a journey to meet the President of the United States.",
    },
]
# Simple function to connect to AWS Bedrock
def connect_to_bedrock():
    """Create a Bedrock runtime client for the ``us-east-1`` region.

    Returns:
        The boto3 ``bedrock-runtime`` client, or ``None`` when the client
        cannot be created. In the failure case an error banner is shown in
        the Streamlit UI; callers treat ``None`` as "use mock responses".
    """
    try:
        return boto3.client('bedrock-runtime', region_name='us-east-1')
    except Exception:
        # Deliberately broad (any boto3/botocore configuration problem means
        # "run in demo mode"), but not a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        st.error("⚠️ AWS Bedrock not configured. Using mock responses for demo.")
        return None
# Get embeddings from Bedrock
def _dummy_embedding(text, dim=1536):
    """Return a reproducible stand-in embedding of ``dim`` floats in [0, 1)."""
    import random

    # Seed from the text so identical inputs always map to the same vector;
    # string seeds are hashed deterministically by ``random``.
    rng = random.Random(str(text))
    return [rng.random() for _ in range(dim)]


def get_embeddings(bedrock_client, text):
    """Embed ``text`` with the Titan embedding model, or fake it for the demo.

    Args:
        bedrock_client: Bedrock runtime client, or a falsy value (``None``)
            when Bedrock is not available.
        text: The text to embed.

    Returns:
        The ``embedding`` list from the model response. When there is no
        client, or the call or response parsing fails, a 1536-element dummy
        vector derived from ``text`` is returned instead so the demo keeps
        working.
    """
    if not bedrock_client:
        # Demo mode: no Bedrock available.
        return _dummy_embedding(text)

    try:
        response = bedrock_client.invoke_model(
            modelId="amazon.titan-embed-text-v1",
            body=json.dumps({"inputText": text}),
        )
        payload = json.loads(response['body'].read())
        return payload['embedding']
    except Exception:
        # Best-effort fallback if the API call fails; KeyboardInterrupt and
        # SystemExit are intentionally not caught.
        return _dummy_embedding(text)
# Create movie documents and store in ChromaDB
def setup_movie_database(bedrock_client):
    """Embed every sample movie and load it into a fresh ChromaDB collection.

    Args:
        bedrock_client: Bedrock runtime client passed through to
            ``get_embeddings``; may be ``None`` for demo mode.

    Returns:
        The ``bollywood_movies`` collection populated with one document per
        entry in ``SAMPLE_MOVIES``.
    """
    st.write("🎬 Setting up Bollywood movies database...")

    chroma_client = chromadb.Client()

    # Drop any collection left over from an earlier setup so the create call
    # below starts from empty. A missing collection is the normal first-run
    # case; the exception type raised for it differs between chromadb
    # releases, hence the broad (but not bare) except.
    try:
        chroma_client.delete_collection("bollywood_movies")
    except Exception:
        pass
    collection = chroma_client.create_collection("bollywood_movies")

    # Parallel lists in the shape collection.add() expects.
    ids = []
    documents = []
    metadatas = []
    embeddings = []

    total = len(SAMPLE_MOVIES)
    progress_bar = st.progress(0)
    for i, movie in enumerate(SAMPLE_MOVIES):
        # Text that gets embedded and later handed to the LLM as context.
        doc_text = (
            f"Title: {movie['title']}\nYear: {movie['year']}\n"
            f"Genre: {movie['genre']}\nDirector: {movie['director']}\n"
            f"Plot: {movie['plot']}"
        )

        ids.append(str(i))
        documents.append(doc_text)
        metadatas.append({
            'title': movie['title'],
            'year': movie['year'],
            # Stored lower-case so they compare equal to the lower-case
            # values produced by detect_filters().
            'genre': movie['genre'].lower(),
            'director': movie['director'].lower(),
            # e.g. 1975 -> "1970s"
            'decade': f"{(movie['year'] // 10) * 10}s",
        })
        embeddings.append(get_embeddings(bedrock_client, doc_text))

        progress_bar.progress((i + 1) / total)

    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings,
    )

    st.success(f"✅ Added {total} movies to database!")
    return collection
# Simple query filter detection
def detect_filters(query):
    """Guess metadata filters (genre, decade, director) from a free-text query.

    Matching is plain case-insensitive substring search. At most one value is
    kept per filter key; the first keyword found (in list order) wins.

    Args:
        query: The user's question. ``None`` is treated like an empty string.

    Returns:
        A dict with any of the keys ``'genre'``, ``'decade'`` and
        ``'director'``; empty when nothing was recognised. Values are
        lower-case strings (decades look like ``'1970s'``).
    """
    query = query or ""
    query_lower = query.lower()
    filters = {}

    # (keyword to look for, genre value to filter on).
    # Canonical genre names come first and in their original order, so any
    # query that matched before still yields the same genre. The aliases
    # afterwards cover common word forms that do not contain the canonical
    # name as a substring (e.g. "romantic" does not contain "romance").
    genre_keywords = (
        ('action', 'action'),
        ('comedy', 'comedy'),
        ('drama', 'drama'),
        ('romance', 'romance'),
        ('sports', 'sports'),
        ('adventure', 'adventure'),
        ('romantic', 'romance'),
        ('comedies', 'comedy'),
    )
    for keyword, genre in genre_keywords:
        if keyword in query_lower:
            filters['genre'] = genre
            break

    # Explicit decade mention, e.g. "from the 1970s".
    decades = ('1960s', '1970s', '1980s', '1990s', '2000s', '2010s')
    for decade in decades:
        if decade in query_lower:
            filters['decade'] = decade
            break

    # A standalone four-digit year (19xx/20xx) is mapped onto its decade and
    # takes precedence over an explicit decade. "1970s" itself is not matched
    # here because the trailing "s" prevents the closing word boundary.
    year_match = re.search(r'\b(19\d{2}|20\d{2})\b', query)
    if year_match:
        year = int(year_match.group(1))
        filters['decade'] = f"{(year // 10) * 10}s"

    # Director detection (simple): only this fixed list of names is known.
    directors = ('hrishikesh mukherjee', 'rajkumar hirani', 'aamir khan', 'yash chopra')
    for director in directors:
        if director in query_lower:
            filters['director'] = director
            break

    return filters
# Retrieve without metadata filter
def retrieve_without_filter(collection, bedrock_client, query, top_k=5):
    """Run a plain vector search (no metadata filter) and time it.

    Args:
        collection: ChromaDB collection to search.
        bedrock_client: Client handed to ``get_embeddings`` (may be ``None``).
        query: The user's question.
        top_k: Maximum number of results requested.

    Returns:
        ``(movies, seconds)`` where ``movies`` is a list of dicts with the
        keys ``'document'``, ``'metadata'`` and ``'distance'``, and
        ``seconds`` covers both embedding the query and the search itself.
    """
    started = time.time()
    hits = collection.query(
        query_embeddings=[get_embeddings(bedrock_client, query)],
        n_results=top_k,
    )
    elapsed = time.time() - started

    # Results are per-query lists; index 0 belongs to our single query.
    movies = [
        {'document': doc, 'metadata': meta, 'distance': dist}
        for doc, meta, dist in zip(
            hits['documents'][0], hits['metadatas'][0], hits['distances'][0]
        )
    ]
    return movies, elapsed
# Retrieve with metadata filter
def _build_where_clause(filters):
    """Turn a flat ``{field: value}`` dict into a ChromaDB ``where`` filter (or ``None``)."""
    conditions = [{key: value} for key, value in (filters or {}).items()]
    if not conditions:
        return None
    if len(conditions) == 1:
        return conditions[0]
    # Chroma accepts only one top-level key in ``where``; several equality
    # conditions have to be combined explicitly with ``$and``.
    return {"$and": conditions}


def retrieve_with_filter(collection, bedrock_client, query, filters, top_k=5):
    """Run a vector search restricted by metadata filters and time it.

    Args:
        collection: ChromaDB collection to search.
        bedrock_client: Client handed to ``get_embeddings`` (may be ``None``).
        query: The user's question.
        filters: ``{metadata_field: value}`` pairs that must all match.
            Empty or ``None`` means no filtering.
        top_k: Maximum number of results requested.

    Returns:
        ``(movies, seconds)`` where ``movies`` is a list of dicts with the
        keys ``'document'``, ``'metadata'`` and ``'distance'``, and
        ``seconds`` covers embedding the query plus the search (including a
        fallback search, if one was needed).
    """
    start_time = time.time()

    query_embedding = get_embeddings(bedrock_client, query)
    search_args = {"query_embeddings": [query_embedding], "n_results": top_k}
    where_clause = _build_where_clause(filters)

    if where_clause is None:
        results = collection.query(**search_args)
    else:
        try:
            results = collection.query(where=where_clause, **search_args)
        except Exception:
            # Best effort: if the filtered search is rejected, fall back to
            # an unfiltered search rather than failing the whole request.
            results = collection.query(**search_args)

    end_time = time.time()

    # Results are per-query lists; index 0 belongs to our single query.
    movies = [
        {'document': doc, 'metadata': meta, 'distance': dist}
        for doc, meta, dist in zip(
            results['documents'][0], results['metadatas'][0], results['distances'][0]
        )
    ]
    return movies, end_time - start_time
# Generate answer using Bedrock
def generate_answer(bedrock_client, query, movies):
    """Ask the Claude model on Bedrock to answer ``query`` from the retrieved movies.

    Args:
        bedrock_client: Bedrock runtime client, or a falsy value (``None``)
            for demo mode.
        query: The user's question.
        movies: Retrieved results; each item's ``'document'`` text is used as
            context for the model.

    Returns:
        The text of the first content item in the model response. Without a
        client, or if the call or response parsing fails, a fixed canned
        sentence is returned instead.
    """
    if not bedrock_client:
        return "🎬 Based on the retrieved movies, here are some recommendations that match your query!"

    # Create context from movies
    context = "\n\n".join(movie['document'] for movie in movies)
    prompt = (
        "\n"
        "Based on the following Bollywood movies information, please answer the user's question.\n"
        f"Question: {query}\n"
        "Movies Information:\n"
        f"{context}\n"
        "Please provide a helpful and informative answer about the movies.\n"
    )

    try:
        body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 400,
            "messages": [{"role": "user", "content": prompt}],
        })
        response = bedrock_client.invoke_model(
            modelId="anthropic.claude-3-haiku-20240307-v1:0",
            body=body,
        )
        result = json.loads(response['body'].read())
        return result['content'][0]['text']
    except Exception:
        # Best-effort fallback for API or parsing failures. Not a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return "🎬 Based on the retrieved movies, here are some great recommendations that match your query!"
# Main app
def _render_movie_results(movies):
    """Render one expander per retrieved movie showing genre, director and distance."""
    for rank, movie in enumerate(movies, 1):
        metadata = movie['metadata']
        with st.expander(f"{rank}. {metadata['title']} ({metadata['year']})"):
            # Genre and director are stored lower-case; title-case them for display.
            st.write(f"**Genre:** {metadata['genre'].title()}")
            st.write(f"**Director:** {metadata['director'].title()}")
            st.write(f"**Distance:** {movie['distance']:.4f}")


def main():
    """Render the Streamlit app: one-time database setup, then the query UI.

    Until the database has been loaded, only the setup button is shown. Once
    it is ready, the user picks or types a query; the app runs the search
    without and (when filters are detected) with metadata filters, shows both
    result lists with timings, and prints a generated answer. State that must
    survive Streamlit reruns (collection, Bedrock client, setup flag) lives
    in ``st.session_state``.
    """
    st.title("🎬 Bollywood Movies RAG with Metadata Filtering")
    st.write("Ask questions about Bollywood movies and see how metadata filtering speeds up retrieval!")

    # Initialize session state
    if 'collection' not in st.session_state:
        st.session_state.collection = None
    if 'setup_done' not in st.session_state:
        st.session_state.setup_done = False

    # Setup section
    if not st.session_state.setup_done:
        st.subheader("🛠️ Setup Movie Database")
        if st.button("🚀 Load Bollywood Movies Data"):
            try:
                bedrock_client = connect_to_bedrock()
                collection = setup_movie_database(bedrock_client)
                st.session_state.collection = collection
                st.session_state.bedrock_client = bedrock_client
                st.session_state.setup_done = True
                st.balloons()
            except Exception as e:
                # Top-level UI boundary: report the failure instead of crashing the page.
                st.error(f"❌ Setup failed: {e}")
        return

    st.success("✅ Movie database is ready!")

    # Sample queries
    st.subheader("🔍 Try These Sample Queries")
    sample_queries = [
        "What are some good action movies?",
        "Tell me a few comedy movies from the 1970s",
        "What is the movie Sholay about?",
        "Tell me a few movies directed by Hrishikesh Mukherjee",
        "What are some romantic movies from the 1990s?",
    ]
    query_option = st.radio("Choose a query:", ["Custom Query"] + sample_queries)
    if query_option == "Custom Query":
        query = st.text_input("Enter your question about Bollywood movies:")
    else:
        query = query_option
        st.write(f"Selected: **{query}**")

    if query and st.button("🔍 Search Movies"):
        try:
            # .get(): the client may legitimately be None (demo mode).
            bedrock_client = st.session_state.get('bedrock_client')
            collection = st.session_state.collection

            # Detect filters
            filters = detect_filters(query)
            st.write("---")

            # Method 1: Without metadata filter
            st.subheader("📊 Method 1: Without Metadata Filter")
            movies_no_filter, time_no_filter = retrieve_without_filter(collection, bedrock_client, query)
            st.write(f"⏱️ **Time taken: {time_no_filter:.4f} seconds**")
            st.write("**Retrieved Movies:**")
            _render_movie_results(movies_no_filter)

            # Method 2: With metadata filter
            st.subheader("🎯 Method 2: With Metadata Filter")
            if filters:
                st.write(f"**Detected Filters:** {filters}")
                movies_with_filter, time_with_filter = retrieve_with_filter(collection, bedrock_client, query, filters)
                st.write(f"⏱️ **Time taken: {time_with_filter:.4f} seconds**")
                st.write("**Filtered Retrieved Movies:**")
                _render_movie_results(movies_with_filter)

                # Performance comparison
                st.subheader("⚡ Performance Comparison")
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Without Filter", f"{time_no_filter:.4f}s")
                with col2:
                    st.metric("With Filter", f"{time_with_filter:.4f}s")
                with col3:
                    # Relative time saved by filtering; guarded against a zero baseline.
                    speedup = ((time_no_filter - time_with_filter) / time_no_filter) * 100 if time_no_filter > 0 else 0
                    st.metric("Speedup", f"{speedup:.1f}%")

                # Generate final answer
                st.subheader("🤖 AI Generated Answer")
                answer = generate_answer(bedrock_client, query, movies_with_filter)
                st.success(answer)
            else:
                st.write("**No specific filters detected** - using general retrieval")
                st.write(f"⏱️ **Time taken: {time_no_filter:.4f} seconds**")

                # Generate answer with no filter results
                st.subheader("🤖 AI Generated Answer")
                answer = generate_answer(bedrock_client, query, movies_no_filter)
                st.success(answer)
        except Exception as e:
            # Top-level UI boundary: report the failure instead of crashing the page.
            st.error(f"❌ Search failed: {e}")

    # Show movie database
    if st.checkbox("📋 Show All Movies in Database"):
        st.subheader("Movie Database")
        df = pd.DataFrame(SAMPLE_MOVIES)
        st.dataframe(df)

    # Reset button
    if st.button("🔄 Reset Database"):
        st.session_state.collection = None
        st.session_state.setup_done = False
        st.rerun()
# Installation and deployment guide
# The markdown bodies are kept as module-level constants so their text starts
# at column 0 regardless of how the function below is indented.
_INSTALL_GUIDE_MD = """
**Step 1: Install Libraries**
```bash
pip install streamlit boto3 chromadb pandas
```
**Step 2: Setup AWS**
```bash
aws configure
```
**Step 3: Run Locally**
```bash
streamlit run bollywood_rag.py
```
"""

_DEPLOY_GUIDE_MD = """
**Step 1: Create files**
- `app.py` (this code)
- `requirements.txt`
- `README.md`
**Step 2: requirements.txt**
```
streamlit
boto3
chromadb
pandas
```
**Step 3: Deploy**
1. Push to GitHub
2. Connect to Hugging Face Spaces
3. Select Streamlit SDK
4. Add AWS secrets in settings
"""


def show_guides():
    """Render the installation and deployment guides as two side-by-side expanders."""
    col1, col2 = st.columns(2)
    with col1:
        with st.expander("📖 Installation Guide"):
            st.markdown(_INSTALL_GUIDE_MD)
    with col2:
        with st.expander("🚀 Deploy to Hugging Face"):
            st.markdown(_DEPLOY_GUIDE_MD)
###
# Run the app
if __name__ == "__main__":
    # Script entry point: the guide expanders are drawn first, followed by
    # the main application UI.
    show_guides()
    main()