Spaces:

DarshanaD
/

Metadata_filtering

Build error

App Files Files Community

DarshanaD committed on May 27, 2025

Commit

ebf59ff

1 Parent(s): 5c4de85

Initial commit

Browse files

Files changed (2) hide show

app.py +436 -53
requirements.txt +5 -1

app.py CHANGED Viewed

@@ -1,64 +1,447 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+import streamlit as st
+import boto3
+import json
+import chromadb
+import pandas as pd
+import time
+import re
+from datetime import datetime
+# Sample Bollywood movies data (simplified for demo).
+# Each entry is a dict with the keys: title, year, genre, director, plot.
+# The list position of an entry is used as its document id when the data is loaded.
+# NOTE(review): "Golmaal" and "Gol Maal" (both 1979, Comedy, Hrishikesh Mukherjee)
+# look like the same film listed twice with different plots - confirm and de-duplicate.
+SAMPLE_MOVIES = [
+    {"title": "Sholay", "year": 1975, "genre": "Action", "director": "Ramesh Sippy",
+     "plot": "Two criminals are hired by a retired police officer to capture a bandit terrorizing a village."},
+    {"title": "Dilwale Dulhania Le Jayenge", "year": 1995, "genre": "Romance", "director": "Aditya Chopra",
+     "plot": "A young man and woman fall in love during a trip to Europe, but face family opposition."},
+    {"title": "Lagaan", "year": 2001, "genre": "Drama", "director": "Ashutosh Gowariker",
+     "plot": "Villagers accept a challenge from British officers to play cricket to avoid paying tax."},
+    {"title": "3 Idiots", "year": 2009, "genre": "Comedy", "director": "Rajkumar Hirani",
+     "plot": "Two friends search for their missing college friend and recall their engineering days."},
+    {"title": "Dangal", "year": 2016, "genre": "Sports", "director": "Nitesh Tiwari",
+     "plot": "A former wrestler trains his daughters to become world-class wrestlers."},
+    {"title": "Anand", "year": 1971, "genre": "Drama", "director": "Hrishikesh Mukherjee",
+     "plot": "A terminally ill man spreads joy and teaches the meaning of life to a doctor."},
+    {"title": "Golmaal", "year": 1979, "genre": "Comedy", "director": "Hrishikesh Mukherjee",
+     "plot": "A man creates chaos by lying about his identity to get a job."},
+    {"title": "Chupke Chupke", "year": 1975, "genre": "Comedy", "director": "Hrishikesh Mukherjee",
+     "plot": "A newlywed plays pranks on his wife's family by pretending to be someone else."},
+    {"title": "Don", "year": 1978, "genre": "Action", "director": "Chandra Barot",
+     "plot": "A police officer impersonates a crime boss to infiltrate his gang."},
+    {"title": "Andaz Apna Apna", "year": 1994, "genre": "Comedy", "director": "Rajkumar Santoshi",
+     "plot": "Two friends compete to marry a wealthy heiress but get caught up in a kidnapping plot."},
+    {"title": "Mughal-E-Azam", "year": 1960, "genre": "Romance", "director": "K. Asif",
+     "plot": "A Mughal prince falls in love with a court dancer, defying his father the emperor."},
+    {"title": "Deewaar", "year": 1975, "genre": "Action", "director": "Yash Chopra",
+     "plot": "Two brothers choose different paths in life - one becomes a police officer, the other a criminal."},
+    {"title": "Queen", "year": 2013, "genre": "Comedy", "director": "Vikas Bahl",
+     "plot": "A woman goes on her honeymoon alone after her wedding is called off."},
+    {"title": "Zindagi Na Milegi Dobara", "year": 2011, "genre": "Adventure", "director": "Zoya Akhtar",
+     "plot": "Three friends go on a bachelor trip to Spain and face their fears."},
+    {"title": "Taare Zameen Par", "year": 2007, "genre": "Drama", "director": "Aamir Khan",
+     "plot": "An art teacher helps a dyslexic child overcome his learning difficulties."},
+    {"title": "Rang De Basanti", "year": 2006, "genre": "Drama", "director": "Rakeysh Omprakash Mehra",
+     "plot": "College students making a documentary about freedom fighters become revolutionaries themselves."},
+    {"title": "Gol Maal", "year": 1979, "genre": "Comedy", "director": "Hrishikesh Mukherjee",
+     "plot": "A young man lies about having a mustache to keep his job with a strict boss."},
+    {"title": "Namak Haraam", "year": 1973, "genre": "Drama", "director": "Hrishikesh Mukherjee",
+     "plot": "A friendship is tested when one friend betrays the other for money and power."},
+    {"title": "Kuch Kuch Hota Hai", "year": 1998, "genre": "Romance", "director": "Karan Johar",
+     "plot": "A man's daughter tries to reunite him with his college sweetheart."},
+    {"title": "My Name is Khan", "year": 2010, "genre": "Drama", "director": "Karan Johar",
+     "plot": "A man with Asperger's syndrome embarks on a journey to meet the President of the United States."}
+]
+# Simple function to connect to AWS Bedrock
+def connect_to_bedrock():
+    """Create a boto3 ``bedrock-runtime`` client for region ``us-east-1``.
+    Returns:
+        The client object, or ``None`` when client creation raised. In the
+        ``None`` case an error banner is shown and the rest of the app falls
+        back to mock embeddings and a canned answer.
+    """
+    # NOTE(review): creating the client may succeed even without usable
+    # credentials, in which case the failure would only surface on the first
+    # invoke_model call - confirm against the deployment environment.
+    try:
+        return boto3.client('bedrock-runtime', region_name='us-east-1')
+    except Exception:
+        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt,
+        # SystemExit and other BaseException-based control flow is not swallowed.
+        st.error("⚠️ AWS Bedrock not configured. Using mock responses for demo.")
+        return None
+# Get embeddings from Bedrock
+def _dummy_embedding(size=1536):
+    """Return ``size`` random floats in [0, 1) as a stand-in embedding vector."""
+    import random
+    return [random.random() for _ in range(size)]
+def get_embeddings(bedrock_client, text):
+    """Return an embedding vector for ``text``.
+    Args:
+        bedrock_client: Bedrock runtime client, or a falsy value for demo mode.
+        text: The text to embed.
+    Returns:
+        The ``embedding`` list from the ``amazon.titan-embed-text-v1`` response.
+        When there is no client, or the call fails, a list of 1536 random
+        floats is returned instead so the demo keeps working; such vectors
+        carry no semantic meaning.
+    """
+    if not bedrock_client:
+        # Return dummy embedding for demo
+        return _dummy_embedding()
+    try:
+        body = json.dumps({"inputText": text})
+        response = bedrock_client.invoke_model(
+            modelId="amazon.titan-embed-text-v1",
+            body=body
+        )
+        result = json.loads(response['body'].read())
+        return result['embedding']
+    except Exception:
+        # Best-effort fallback is kept, but the failure is logged rather than
+        # silently swallowed, and BaseException-based control flow is no
+        # longer caught.
+        import logging
+        logging.getLogger(__name__).exception(
+            "Bedrock embedding call failed; using a random dummy embedding"
+        )
+        return _dummy_embedding()
+# Create movie documents and store in ChromaDB
+def setup_movie_database(bedrock_client):
+    """Embed every entry of SAMPLE_MOVIES and load it into a fresh collection.
+    Args:
+        bedrock_client: Client passed through to get_embeddings (may be None).
+    Returns:
+        The newly created ``bollywood_movies`` ChromaDB collection.
+    Side effects:
+        Writes status text, a progress bar and a success banner to the page.
+    """
+    st.write("🎬 Setting up Bollywood movies database...")
+    # Create ChromaDB client
+    chroma_client = chromadb.Client()
+    # Create or recreate collection
+    try:
+        chroma_client.delete_collection("bollywood_movies")
+    except Exception:
+        # Deleting is only a clean-up step before re-creating the collection;
+        # a failure here (presumably "collection does not exist" on first
+        # run) is ignored. ``Exception`` rather than a bare ``except`` keeps
+        # BaseException-based control flow intact.
+        pass
+    collection = chroma_client.create_collection("bollywood_movies")
+    # Prepare data for ChromaDB
+    ids = []
+    documents = []
+    metadatas = []
+    embeddings = []
+    total = len(SAMPLE_MOVIES)
+    progress_bar = st.progress(0)
+    for i, movie in enumerate(SAMPLE_MOVIES):
+        # Create document text
+        doc_text = f"Title: {movie['title']}\nYear: {movie['year']}\nGenre: {movie['genre']}\nDirector: {movie['director']}\nPlot: {movie['plot']}"
+        # Get embedding
+        embedding = get_embeddings(bedrock_client, doc_text)
+        # Prepare data
+        ids.append(str(i))
+        documents.append(doc_text)
+        # genre/director are stored lower-cased so they can be matched against
+        # the lower-cased values produced by detect_filters.
+        metadatas.append({
+            'title': movie['title'],
+            'year': movie['year'],
+            'genre': movie['genre'].lower(),
+            'director': movie['director'].lower(),
+            'decade': f"{(movie['year'] // 10) * 10}s"
+        })
+        embeddings.append(embedding)
+        progress_bar.progress((i + 1) / total)
+    # Add to ChromaDB
+    collection.add(
+        ids=ids,
+        documents=documents,
+        metadatas=metadatas,
+        embeddings=embeddings
+    )
+    st.success(f"✅ Added {total} movies to database!")
+    return collection
+# Simple query filter detection
+def detect_filters(query):
+    """Derive metadata filters from a free-text query by substring matching.
+    Args:
+        query: The user's question.
+    Returns:
+        A dict with any of the keys ``genre``, ``decade`` and ``director``
+        (all lower-case values); empty when nothing was recognised.
+    """
+    query_lower = query.lower()
+    filters = {}
+    # Genre detection. Canonical names are tried first, in their original
+    # priority order, so every query that matched before still yields the
+    # same genre.
+    genres = ['action', 'comedy', 'drama', 'romance', 'sports', 'adventure']
+    genre = next((g for g in genres if g in query_lower), None)
+    if genre is None:
+        # Word forms that do not contain the canonical name as a substring,
+        # e.g. "romantic movies" never matched 'romance'.
+        genre_synonyms = {'romantic': 'romance', 'comedies': 'comedy'}
+        genre = next(
+            (canonical for word, canonical in genre_synonyms.items() if word in query_lower),
+            None,
+        )
+    if genre is not None:
+        filters['genre'] = genre
+    # Decade detection
+    decades = ['1960s', '1970s', '1980s', '1990s', '2000s', '2010s']
+    for decade in decades:
+        if decade in query_lower:
+            filters['decade'] = decade
+            break
+    # Year detection: a standalone four-digit year (19xx/20xx) is mapped to
+    # its decade and overrides a decade found above. "1970s" itself does not
+    # match because of the trailing word boundary.
+    years = re.findall(r'\b(19\d{2}|20\d{2})\b', query)
+    if years:
+        year = int(years[0])
+        filters['decade'] = f"{(year // 10) * 10}s"
+    # Director detection (simple): only these four names are recognised.
+    directors = ['hrishikesh mukherjee', 'rajkumar hirani', 'aamir khan', 'yash chopra']
+    for director in directors:
+        if director in query_lower:
+            filters['director'] = director
+            break
+    return filters
+# Retrieve without metadata filter
+def retrieve_without_filter(collection, bedrock_client, query, top_k=5):
+    """Run an unfiltered vector search for ``query``.
+    Args:
+        collection: ChromaDB collection to search.
+        bedrock_client: Client passed through to get_embeddings (may be None).
+        query: The user's question.
+        top_k: Maximum number of results requested.
+    Returns:
+        ``(movies, elapsed_seconds)`` where ``movies`` is a list of dicts with
+        the keys ``document``, ``metadata`` and ``distance``. The elapsed time
+        covers both the embedding call and the collection query.
+    """
+    # perf_counter is monotonic and high-resolution, which matters for
+    # durations that are displayed with four decimal places.
+    start_time = time.perf_counter()
+    # Get query embedding
+    query_embedding = get_embeddings(bedrock_client, query)
+    # Search without filters
+    results = collection.query(
+        query_embeddings=[query_embedding],
+        n_results=top_k
+    )
+    elapsed = time.perf_counter() - start_time
+    # Format results: one query embedding was sent, so only result row 0 is read.
+    movies = [
+        {'document': document, 'metadata': metadata, 'distance': distance}
+        for document, metadata, distance in zip(
+            results['documents'][0], results['metadatas'][0], results['distances'][0]
+        )
+    ]
+    return movies, elapsed
+# Retrieve with metadata filter
+def _build_where_clause(filters):
+    """Turn a flat {field: value} dict into a ChromaDB ``where`` filter (or None)."""
+    conditions = [{key: value} for key, value in filters.items()]
+    if not conditions:
+        return None
+    if len(conditions) == 1:
+        return conditions[0]
+    # Several conditions have to be combined explicitly; a flat multi-key
+    # dict is rejected by ChromaDB.
+    return {"$and": conditions}
+def retrieve_with_filter(collection, bedrock_client, query, filters, top_k=5):
+    """Run a vector search for ``query`` restricted by metadata ``filters``.
+    Args:
+        collection: ChromaDB collection to search.
+        bedrock_client: Client passed through to get_embeddings (may be None).
+        query: The user's question.
+        filters: Flat ``{metadata_field: value}`` dict, e.g. from detect_filters.
+        top_k: Maximum number of results requested.
+    Returns:
+        ``(movies, elapsed_seconds)`` where ``movies`` is a list of dicts with
+        the keys ``document``, ``metadata`` and ``distance``. The elapsed time
+        covers the embedding call and the query (including a fallback query).
+    """
+    start_time = time.perf_counter()
+    # Get query embedding
+    query_embedding = get_embeddings(bedrock_client, query)
+    # Create where clause for filtering
+    where_clause = _build_where_clause(filters)
+    # Search with filters
+    try:
+        results = collection.query(
+            query_embeddings=[query_embedding],
+            n_results=top_k,
+            where=where_clause
+        )
+    except Exception:
+        # If filtering fails, fall back to no filter - but tell the user, as
+        # the results shown under "with filter" are then not filtered at all.
+        st.warning("⚠️ Metadata filter could not be applied; showing unfiltered results.")
+        results = collection.query(
+            query_embeddings=[query_embedding],
+            n_results=top_k
+        )
+    elapsed = time.perf_counter() - start_time
+    # Format results: one query embedding was sent, so only result row 0 is read.
+    movies = [
+        {'document': document, 'metadata': metadata, 'distance': distance}
+        for document, metadata, distance in zip(
+            results['documents'][0], results['metadatas'][0], results['distances'][0]
+        )
+    ]
+    return movies, elapsed
+# Generate answer using Bedrock
+def generate_answer(bedrock_client, query, movies):
+    """Ask the Claude 3 Haiku model on Bedrock to answer ``query`` from ``movies``.
+    Args:
+        bedrock_client: Bedrock runtime client, or a falsy value for demo mode.
+        query: The user's question.
+        movies: Retrieved results; each item's ``document`` text is used as context.
+    Returns:
+        The text of the first content item of the model response, or a canned
+        message when there is no client or the call fails.
+    """
+    if not bedrock_client:
+        return "🎬 Based on the retrieved movies, here are some recommendations that match your query!"
+    # Create context from movies
+    context = "\n\n".join([movie['document'] for movie in movies])
+    prompt = f"""
+    Based on the following Bollywood movies information, please answer the user's question.
+    Question: {query}
+    Movies Information:
+    {context}
+    Please provide a helpful and informative answer about the movies.
+    """
+    try:
+        body = json.dumps({
+            "anthropic_version": "bedrock-2023-05-31",
+            "max_tokens": 400,
+            "messages": [{"role": "user", "content": prompt}]
+        })
+        response = bedrock_client.invoke_model(
+            modelId="anthropic.claude-3-haiku-20240307-v1:0",
+            body=body
+        )
+        result = json.loads(response['body'].read())
+        return result['content'][0]['text']
+    except Exception:
+        # Best-effort fallback is kept, but the failure is logged rather than
+        # silently swallowed, and BaseException-based control flow is no
+        # longer caught.
+        import logging
+        logging.getLogger(__name__).exception(
+            "Bedrock answer generation failed; returning canned answer"
+        )
+        return "🎬 Based on the retrieved movies, here are some great recommendations that match your query!"
+# Main app
+def _render_movies(movies):
+    """Render one expander per retrieved movie showing genre, director and distance."""
+    for i, movie in enumerate(movies, 1):
+        meta = movie['metadata']
+        with st.expander(f"{i}. {meta['title']} ({meta['year']})"):
+            st.write(f"**Genre:** {meta['genre'].title()}")
+            st.write(f"**Director:** {meta['director'].title()}")
+            st.write(f"**Distance:** {movie['distance']:.4f}")
+def main():
+    """Render the app: a setup screen first, then the query/comparison screen.
+    State is kept in ``st.session_state`` under ``collection``,
+    ``bedrock_client`` and ``setup_done``.
+    """
+    st.title("🎬 Bollywood Movies RAG with Metadata Filtering")
+    st.write("Ask questions about Bollywood movies and see how metadata filtering speeds up retrieval!")
+    # Initialize session state
+    if 'collection' not in st.session_state:
+        st.session_state.collection = None
+    if 'setup_done' not in st.session_state:
+        st.session_state.setup_done = False
+    # Setup section
+    if not st.session_state.setup_done:
+        st.subheader("🛠️ Setup Movie Database")
+        if st.button("🚀 Load Bollywood Movies Data"):
+            try:
+                bedrock_client = connect_to_bedrock()
+                collection = setup_movie_database(bedrock_client)
+                st.session_state.collection = collection
+                st.session_state.bedrock_client = bedrock_client
+                st.session_state.setup_done = True
+                st.balloons()
+            except Exception as e:
+                st.error(f"❌ Setup failed: {e}")
+    else:
+        st.success("✅ Movie database is ready!")
+        # Sample queries
+        st.subheader("🔍 Try These Sample Queries")
+        sample_queries = [
+            "What are some good action movies?",
+            "Tell me a few comedy movies from the 1970s",
+            "What is the movie Sholay about?",
+            "Tell me a few movies directed by Hrishikesh Mukherjee",
+            "What are some romantic movies from the 1990s?"
+        ]
+        query_option = st.radio("Choose a query:", ["Custom Query"] + sample_queries)
+        if query_option == "Custom Query":
+            query = st.text_input("Enter your question about Bollywood movies:")
+        else:
+            query = query_option
+            st.write(f"Selected: **{query}**")
+        if query:
+            if st.button("🔍 Search Movies"):
+                try:
+                    bedrock_client = st.session_state.bedrock_client
+                    collection = st.session_state.collection
+                    # Detect filters
+                    filters = detect_filters(query)
+                    st.write("---")
+                    # Method 1: Without metadata filter
+                    st.subheader("📊 Method 1: Without Metadata Filter")
+                    movies_no_filter, time_no_filter = retrieve_without_filter(collection, bedrock_client, query)
+                    st.write(f"⏱️ **Time taken: {time_no_filter:.4f} seconds**")
+                    st.write("**Retrieved Movies:**")
+                    _render_movies(movies_no_filter)
+                    # Method 2: With metadata filter
+                    st.subheader("🎯 Method 2: With Metadata Filter")
+                    if filters:
+                        st.write(f"**Detected Filters:** {filters}")
+                        movies_with_filter, time_with_filter = retrieve_with_filter(collection, bedrock_client, query, filters)
+                        st.write(f"⏱️ **Time taken: {time_with_filter:.4f} seconds**")
+                        st.write("**Filtered Retrieved Movies:**")
+                        _render_movies(movies_with_filter)
+                        # Performance comparison
+                        st.subheader("⚡ Performance Comparison")
+                        col1, col2, col3 = st.columns(3)
+                        with col1:
+                            st.metric("Without Filter", f"{time_no_filter:.4f}s")
+                        with col2:
+                            st.metric("With Filter", f"{time_with_filter:.4f}s")
+                        with col3:
+                            # Relative time saved by filtering; negative when the
+                            # filtered query was slower. Guarded against a zero
+                            # baseline.
+                            speedup = ((time_no_filter - time_with_filter) / time_no_filter) * 100 if time_no_filter > 0 else 0
+                            st.metric("Speedup", f"{speedup:.1f}%")
+                        # Generate final answer
+                        st.subheader("🤖 AI Generated Answer")
+                        answer = generate_answer(bedrock_client, query, movies_with_filter)
+                        st.success(answer)
+                    else:
+                        st.write("**No specific filters detected** - using general retrieval")
+                        st.write(f"⏱️ **Time taken: {time_no_filter:.4f} seconds**")
+                        # Generate answer with no filter results
+                        st.subheader("🤖 AI Generated Answer")
+                        answer = generate_answer(bedrock_client, query, movies_no_filter)
+                        st.success(answer)
+                except Exception as e:
+                    st.error(f"❌ Search failed: {e}")
+        # Show movie database
+        if st.checkbox("📋 Show All Movies in Database"):
+            st.subheader("Movie Database")
+            df = pd.DataFrame(SAMPLE_MOVIES)
+            st.dataframe(df)
+        # Reset button
+        if st.button("🔄 Reset Database"):
+            st.session_state.collection = None
+            st.session_state.setup_done = False
+            st.rerun()
+# Installation and deployment guide
+def show_guides():
+    """Render two side-by-side expanders: local installation and Hugging Face deployment."""
+    col1, col2 = st.columns(2)
+    with col1:
+        with st.expander("📖 Installation Guide"):
+            # The run command names app.py, matching the file name used in
+            # the deployment guide in the second column.
+            st.markdown("""
+            **Step 1: Install Libraries**
+            ```bash
+            pip install streamlit boto3 chromadb pandas
+            ```
+            **Step 2: Setup AWS**
+            ```bash
+            aws configure
+            ```
+            **Step 3: Run Locally**
+            ```bash
+            streamlit run app.py
+            ```
+            """)
+    with col2:
+        with st.expander("🚀 Deploy to Hugging Face"):
+            st.markdown("""
+            **Step 1: Create files**
+            - `app.py` (this code)
+            - `requirements.txt`
+            - `README.md`
+            **Step 2: requirements.txt**
+            ```
+            streamlit
+            boto3
+            chromadb
+            pandas
+            ```
+            **Step 3: Deploy**
+            1. Push to GitHub
+            2. Connect to Hugging Face Spaces
+            3. Select Streamlit SDK
+            4. Add AWS secrets in settings
+            """)
+###
+# Run the app.
+# show_guides() is called before main(), so the two guide expanders are
+# emitted ahead of the title and the rest of the page that main() writes.
 if __name__ == "__main__":
+    show_guides()
+    main()

requirements.txt CHANGED Viewed

	@@ -1 +1,5 @@
1	- huggingface_hub==0.25.2

+huggingface_hub==0.25.2
+streamlit
+boto3
+chromadb
+pandas