Spaces:
Sleeping
Sleeping
| import re | |
| import pandas as pd | |
| from datetime import datetime, timedelta | |
| from langchain.schema import Document | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| import gradio as gr | |
| import os | |
| from collections import Counter | |
| from openai import OpenAI | |
| # ============================================ | |
| # DATE PARSING | |
| # ============================================ | |
def _day_bounds(start, end):
    """Expand two datetimes to whole-day bounds: start at 00:00:00, end at 23:59:59."""
    return (
        datetime(start.year, start.month, start.day),
        datetime(end.year, end.month, end.day, 23, 59, 59),
    )


def interpret_date_filter_from_question(question, reference_date=None):
    """Extract a (start, end) datetime range from a natural-language question.

    Recognized phrases, all relative to *reference_date* (defaults to now):
      - "last N days"  -> [ref - N days, ref]
      - "last week" / "first week" -> [ref - 7 days, ref]
      - "first 30 days" / "first month" -> [ref - 30 days, ref]

    Returns (None, None) when no date phrase is found.

    NOTE(review): "first week" / "first month" are treated the same as
    "last week" / "last 30 days" (i.e. counted back from the reference
    date, not forward from a launch date) — confirm this is intended.
    """
    q = (question or '').lower()
    ref = reference_date if reference_date is not None else datetime.now()

    # Explicit "last N days" takes precedence over the fixed phrases.
    m = re.search(r'last\s+(\d+)\s+days?', q)
    if m:
        n = int(m.group(1))
        return _day_bounds(ref - timedelta(days=n), ref)
    if 'last week' in q or 'first week' in q:
        return _day_bounds(ref - timedelta(days=7), ref)
    if 'first 30 days' in q or 'first month' in q:
        return _day_bounds(ref - timedelta(days=30), ref)
    return None, None
def filter_chunks_by_date(chunks, start, end):
    """Return the chunks whose 'time' metadata (dd/mm/yyyy) falls within [start, end].

    With no date range (either bound is None), or when nothing matches,
    the full chunk list is returned unchanged so retrieval never goes empty.
    Chunks with a missing or unparseable date are skipped silently.
    """
    if start is None or end is None:
        return chunks

    in_range = []
    for chunk in chunks:
        try:
            stamp = chunk.metadata.get("time")
            if not stamp:
                continue
            when = datetime.strptime(stamp, "%d/%m/%Y")
        except Exception:
            # Best-effort: bad/missing metadata just excludes the chunk.
            continue
        if start <= when <= end:
            in_range.append(chunk)

    # Fall back to everything rather than returning an empty corpus.
    return in_range or chunks
| # ============================================ | |
| # DOCUMENT PROCESSING | |
| # ============================================ | |
def build_documents(df):
    """Turn each dataframe row into a langchain Document.

    Page content comes from the 'correctmapping_nosym' column (falling back
    to 'text', then empty), with the predicted topic/sentiment labels and
    the timestamp carried along as string metadata.
    """
    docs = []
    for _, row in df.iterrows():
        content = row.get("correctmapping_nosym", row.get("text", ""))
        docs.append(
            Document(
                page_content=str(content),
                metadata={
                    "topic": str(row.get("predicted_topic", "")),
                    "sentiment": str(row.get("predicted_sentiment", "")),
                    "time": str(row.get("time", "")),
                },
            )
        )
    return docs
def chunk_documents(documents, chunk_size=400, chunk_overlap=50):
    """Split documents into overlapping character chunks for embedding.

    Thin wrapper around RecursiveCharacterTextSplitter; metadata is
    propagated to each resulting chunk by the splitter itself.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)
def analyze_metadata(retrieved_docs):
    """Summarize topic and sentiment label frequencies over retrieved docs.

    Docs with an empty or missing label are excluded from that label's
    distribution; 'total_docs' always counts every retrieved document.
    """
    topic_tally = Counter(
        d.metadata.get('topic') for d in retrieved_docs if d.metadata.get('topic')
    )
    sentiment_tally = Counter(
        d.metadata.get('sentiment') for d in retrieved_docs if d.metadata.get('sentiment')
    )
    return {
        'topic_distribution': dict(topic_tally),
        'sentiment_distribution': dict(sentiment_tally),
        'total_docs': len(retrieved_docs),
    }
# ============================================
# LOAD MODELS AND DATA
# ============================================
# Everything below runs at import time: the app fails fast on a missing
# API key or data file instead of erroring on the first user question.
print("๐ Loading models and data...")

# Sentence-embedding model used for the FAISS index; pinned to CPU so the
# app runs without a GPU.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)

# OpenAI client — the key must come from the environment, never hard-coded.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set!")
client = OpenAI(api_key=openai_api_key)

# Load the review dataset and convert it into chunked Documents.
print("๐ Loading review data...")
df = pd.read_csv("data/review_dataset.csv")
documents = build_documents(df)
chunks = chunk_documents(documents)

# Build the global (unfiltered) vector store once at startup.
print(f"๐ Creating vector store from {len(chunks)} chunks...")
vectorstore = FAISS.from_documents(chunks, embedding_model)
# k=100 gives the metadata-statistics panel a wide sample; only the top
# few retrieved docs actually go into the LLM prompt.
retriever = vectorstore.as_retriever(search_kwargs={"k": 100})

print("โ System ready!")
| # ============================================ | |
| # RAG FUNCTIONS | |
| # ============================================ | |
# Cache of date-filtered vector stores, keyed by "{start}_{end}".
# Bounded: date ranges shift with the reference date, so an unbounded dict
# would grow indefinitely on a long-running server.
_MAX_FILTERED_STORES = 32
date_filtered_stores = {}

def get_or_create_filtered_store(start, end, filtered_chunks):
    """Return a FAISS store over *filtered_chunks*, cached per (start, end).

    Building a FAISS index is expensive, so repeated queries over the same
    date range reuse the previously built index. When the cache exceeds
    _MAX_FILTERED_STORES entries, the oldest entry is evicted (dict
    preserves insertion order).
    """
    cache_key = f"{start}_{end}"
    if cache_key not in date_filtered_stores:
        if len(date_filtered_stores) >= _MAX_FILTERED_STORES:
            # FIFO eviction: drop the oldest cached index.
            date_filtered_stores.pop(next(iter(date_filtered_stores)))
        date_filtered_stores[cache_key] = FAISS.from_documents(filtered_chunks, embedding_model)
    return date_filtered_stores[cache_key]
def _build_context(retrieved_docs, use_metadata_in_prompt):
    """Format the top retrieved reviews as a numbered context block.

    Only the first 6 docs are used, and each review's text is truncated,
    to bound the prompt size sent to the API.
    """
    context_parts = []
    for i, d in enumerate(retrieved_docs[:6]):
        if use_metadata_in_prompt:
            sentiment = d.metadata.get('sentiment', 'N/A')
            topic = d.metadata.get('topic', 'N/A')
            context_parts.append(
                f"Review {i+1} [Topic: {topic}, Sentiment: {sentiment}]: {d.page_content[:250]}"
            )
        else:
            context_parts.append(
                f"Review {i+1}: {d.page_content[:300]}"
            )
    return "\n\n".join(context_parts)


def _build_metadata_display(metadata_stats, response):
    """Render retrieval statistics and API usage as a markdown panel."""
    metadata_display = "## ๐ Retrieved Reviews Analysis\n\n"
    metadata_display += f"**Total reviews analyzed:** {metadata_stats['total_docs']}\n\n"
    if metadata_stats['topic_distribution']:
        metadata_display += "**Topic Distribution:**\n"
        for topic, count in sorted(metadata_stats['topic_distribution'].items(),
                                   key=lambda x: x[1], reverse=True):
            percentage = (count / metadata_stats['total_docs']) * 100
            metadata_display += f"- {topic}: {count} ({percentage:.1f}%)\n"
    if metadata_stats['sentiment_distribution']:
        metadata_display += "\n**Sentiment Distribution:**\n"
        for sentiment, count in sorted(metadata_stats['sentiment_distribution'].items(),
                                       key=lambda x: x[1], reverse=True):
            percentage = (count / metadata_stats['total_docs']) * 100
            metadata_display += f"- {sentiment}: {count} ({percentage:.1f}%)\n"
    metadata_display += f"\n\n**API Usage:**\n"
    metadata_display += f"- Tokens used: {response.usage.total_tokens}\n"
    metadata_display += f"- Model: {response.model}\n"
    return metadata_display


def answer_question_with_metadata(question, show_metadata=True, use_metadata_in_prompt=True):
    """Answer a question over the review corpus via RAG + GPT-4o-mini.

    Pipeline: parse an optional date filter from the question, retrieve
    relevant chunks (using a cached date-filtered index when applicable),
    build a prompt (optionally annotating each review with its
    topic/sentiment labels), call the OpenAI API, and format the result.

    Returns:
        tuple[str, str]: (answer text, markdown metadata panel — empty
        string when show_metadata is False or on error).
    """
    if not question or not question.strip():
        return "โ ๏ธ Please enter a question.", ""
    try:
        # Optional date window inferred from the question text.
        start, end = interpret_date_filter_from_question(question)
        filtered_chunks = filter_chunks_by_date(chunks, start, end)
        if not filtered_chunks:
            return "โ No reviews found for this date range.", ""

        # Retrieve: date-restricted index when a filter was found,
        # otherwise the global retriever built at startup.
        if start is not None:
            temp_vectorstore = get_or_create_filtered_store(start, end, filtered_chunks)
            temp_retriever = temp_vectorstore.as_retriever(search_kwargs={"k": 8})
            retrieved_docs = temp_retriever.get_relevant_documents(question)
        else:
            retrieved_docs = retriever.get_relevant_documents(question)
        if not retrieved_docs:
            return "โ No relevant reviews found for your question.", ""

        # Label statistics over everything retrieved (wider than the
        # prompt context, which is capped at 6 reviews).
        metadata_stats = analyze_metadata(retrieved_docs)

        context = _build_context(retrieved_docs, use_metadata_in_prompt)

        # Two system prompts: one that teaches the model to use the
        # topic/sentiment labels, one label-free for A/B comparison.
        if use_metadata_in_prompt:
            system_message = """You are an expert game review analyst. Each review includes Topic and Sentiment labels that provide important context about what players are discussing and their overall feeling.
Analyze the reviews considering both the content and the metadata (topics and sentiments). Provide a structured analysis with:
1. **Key Praises** (3-4 bullet points highlighting what players love)
2. **Main Complaints** (3-4 bullet points covering the biggest issues)
3. **Topic Insights** (mention which topics appear most frequently and any patterns)
4. **Brief Summary** (2-3 sentences with actionable takeaways)
Keep your response concise, specific, and actionable."""
        else:
            system_message = """You are an expert game review analyst. Analyze player reviews and provide clear, actionable insights.
Provide a structured analysis with:
1. **Key Praises** (3-4 bullet points highlighting what players love)
2. **Main Complaints** (3-4 bullet points covering the biggest issues)
3. **Brief Summary** (2-3 sentences with actionable takeaways)
Keep your response concise and specific."""

        # The user message was duplicated verbatim in both branches of the
        # original code — it only depends on question/context, so build once.
        user_message = f"""Question: {question}
Reviews to analyze:
{context}
Please provide your analysis following the structure outlined."""

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            temperature=0.7,
            max_tokens=600,
            top_p=0.9
        )
        answer = response.choices[0].message.content

        metadata_display = ""
        if show_metadata:
            metadata_display = _build_metadata_display(metadata_stats, response)
        return answer, metadata_display
    except Exception as e:
        import traceback
        # NOTE(review): the full traceback is surfaced in the UI — useful
        # for a demo, but consider logging instead for a public deployment.
        error_msg = f"โ Error: {str(e)}\n\n{traceback.format_exc()}"
        return error_msg, ""
# ============================================
# GRADIO UI
# ============================================
# NOTE(review): indentation was lost in extraction; the nesting of the
# layout context managers below is reconstructed — confirm against the
# original file.
custom_css = """ #component-0 {max-width: 90%; margin: auto; padding: 2%;} .contain {max-height: 80vh; overflow-y: auto;} """

with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Header banner.
    gr.Markdown("""
# ๐ฎ Game Review Analysis System
### Powered by GPT-4o-mini with Topic & Sentiment Intelligence
Ask questions about player feedback and compare results with/without metadata! โก **Lightning fast API responses**
""")
    with gr.Row():
        with gr.Column(scale=2):
            # Free-text question; Enter key triggers analysis (see the
            # event handlers at the bottom of this block).
            question_input = gr.Textbox(
                label="๐ฌ Ask a question about player reviews",
                placeholder="e.g., What do players say about gameplay in the first week?",
                lines=2
            )
            with gr.Row():
                # Toggles the right-hand statistics panel.
                show_metadata_checkbox = gr.Checkbox(
                    label="๐ Show metadata statistics",
                    value=True
                )
                # Toggles topic/sentiment labels in the LLM prompt
                # (A/B comparison switch).
                use_metadata_checkbox = gr.Checkbox(
                    label="๐ท๏ธ Use topic/sentiment in analysis",
                    value=True,
                    info="Toggle to compare with/without metadata"
                )
            submit_btn = gr.Button("๐ Analyze Reviews", variant="primary", size="lg")
    gr.Markdown("---")
    with gr.Row():
        with gr.Column(scale=3):
            # Main LLM answer output.
            answer_output = gr.Textbox(
                label="๐ Analysis Results",
                lines=16,
                show_copy_button=True
            )
        with gr.Column(scale=1):
            # Markdown panel: topic/sentiment distributions + API usage.
            metadata_output = gr.Markdown(value="")
    # Collapsible help panel explaining the metadata A/B comparison.
    with gr.Accordion("โน๏ธ How to Verify Topic Contribution", open=False):
        gr.Markdown("""
### ๐ฌ Verification Methods:
**Method 1: Direct A/B Comparison**
1. โ Enable "Use topic/sentiment in analysis"
2. Ask a question and note the results
3. โ Disable "Use topic/sentiment in analysis"
4. Ask the same question again
5. Compare: WITH metadata should show topic-specific insights
**Method 2: Check Metadata Panel**
- Look at the right panel showing topic/sentiment distribution
- Diverse topics = Your classification is working
- These labels are passed to GPT-4o-mini for analysis
**Method 3: Topic-Focused Questions**
- Try: "What issues are mentioned about **performance**?"
- Try: "What do players say about **graphics**?"
- With metadata: More focused, categorized results
### ๐ What the Model Sees:
- **Without metadata**: `Review 1: [raw text]`
- **With metadata**: `Review 1 [Topic: Gameplay, Sentiment: Positive]: [raw text]`
The metadata provides context that helps GPT-4o-mini understand and categorize feedback better!
""")
    gr.Markdown("---")
    # Clickable example questions that pre-fill the three inputs.
    gr.Examples(
        examples=[
            ["What were the most common praises in the first 30 days?", True, True],
            ["What is the overall evaluation of the game?", True, True],
            ["What do players complain about most?", True, True],
            ["What bugs or technical issues are mentioned?", True, True],
            ["What are the positive aspects of gameplay?", True, True]
        ],
        inputs=[question_input, show_metadata_checkbox, use_metadata_checkbox]
    )
    gr.Markdown("""
---
**Model:** GPT-4o-mini (OpenAI API)
**Embeddings:** all-MiniLM-L6-v2
**Response time:** ~2-5 seconds โก
**Benefits:** Faster, more consistent, no GPU needed
""")
    # Event handlers — button click and textbox submit both run the same
    # RAG pipeline and fill the answer + metadata panels.
    submit_btn.click(
        fn=answer_question_with_metadata,
        inputs=[question_input, show_metadata_checkbox, use_metadata_checkbox],
        outputs=[answer_output, metadata_output]
    )
    question_input.submit(
        fn=answer_question_with_metadata,
        inputs=[question_input, show_metadata_checkbox, use_metadata_checkbox],
        outputs=[answer_output, metadata_output]
    )
if __name__ == "__main__":
    # share=True exposes a temporary public Gradio link; show_error surfaces
    # server-side exceptions in the browser UI.
    demo.launch(
        share=True,
        show_error=True
    )