print("=== golf_swing_rag.py Import Debug ===") # Initialize all variables to None first pd = None np = None faiss = None SentenceTransformer = None st = None openai = None load_dotenv = None os = None json = None pickle = None List = None Dict = None Tuple = None re = None datetime = None try: print("Importing pandas...") import pandas as pd print("✓ pandas imported successfully") except ImportError as e: print(f"✗ pandas import failed: {e}") try: print("Importing numpy...") import numpy as np print("✓ numpy imported successfully") except ImportError as e: print(f"✗ numpy import failed: {e}") try: print("Importing faiss...") import faiss print("✓ faiss imported successfully") except ImportError as e: print(f"✗ faiss import failed: {e}") try: print("Importing sentence_transformers...") from sentence_transformers import SentenceTransformer print("✓ sentence_transformers imported successfully") except ImportError as e: print(f"✗ sentence_transformers import failed: {e}") print("Trying alternative sentence_transformers import methods...") # Try importing the package first try: import sentence_transformers print("✓ sentence_transformers package available") from sentence_transformers import SentenceTransformer print("✓ SentenceTransformer class imported successfully") except ImportError as e2: print(f"✗ Alternative sentence_transformers import failed: {e2}") # Print more detailed error information try: import pkg_resources installed_packages = [d.project_name for d in pkg_resources.working_set] if 'sentence-transformers' in installed_packages: print("✓ sentence-transformers package is installed") else: print("✗ sentence-transformers package not found in installed packages") except: pass try: print("Importing streamlit...") import streamlit as st print("✓ streamlit imported successfully") except ImportError as e: print(f"✗ streamlit import failed: {e}") try: print("Importing openai...") import openai print("✓ openai imported successfully") except ImportError as e: print(f"✗ openai import failed: {e}") try: print("Importing dotenv...") from dotenv import load_dotenv print("✓ dotenv imported successfully") except ImportError as e: print(f"✗ dotenv import failed: {e}") try: print("Importing os...") import os print("✓ os imported successfully") except ImportError as e: print(f"✗ os import failed: {e}") try: print("Importing json...") import json print("✓ json imported successfully") except ImportError as e: print(f"✗ json import failed: {e}") try: print("Importing pickle...") import pickle print("✓ pickle imported successfully") except ImportError as e: print(f"✗ pickle import failed: {e}") try: print("Importing typing...") from typing import List, Dict, Tuple print("✓ typing imported successfully") except ImportError as e: print(f"✗ typing import failed: {e}") try: print("Importing re...") import re print("✓ re imported successfully") except ImportError as e: print(f"✗ re import failed: {e}") try: print("Importing datetime...") from datetime import datetime print("✓ datetime imported successfully") except ImportError as e: print(f"✗ datetime import failed: {e}") print("=== End golf_swing_rag.py Import Debug ===") # Check if critical dependencies are available missing_deps = [] if pd is None: missing_deps.append("pandas") if np is None: missing_deps.append("numpy") if faiss is None: missing_deps.append("faiss") if SentenceTransformer is None: missing_deps.append("sentence_transformers") if st is None: missing_deps.append("streamlit") if openai is None: missing_deps.append("openai") if os is None: 
missing_deps.append("os") if missing_deps: print(f"✗ Critical dependencies missing: {missing_deps}") raise ImportError(f"Missing required dependencies: {', '.join(missing_deps)}") else: print("✓ All critical dependencies available") print("") # Load environment variables if available if load_dotenv: load_dotenv() class GolfSwingRAG: def __init__(self, csv_file_path: str = None): """Initialize the Golf Swing RAG system""" print("=== GolfSwingRAG Initialization Debug ===") print(f"Current working directory: {os.getcwd()}") print(f"__file__ location: {__file__}") print(f"Directory of this file: {os.path.dirname(__file__)}") # Set default CSV path based on current working directory if csv_file_path is None: # Try multiple possible locations possible_paths = [ "golf_swing_articles_complete.csv", # Same directory "../golf_swing_articles_complete.csv", # Parent directory "../../golf_swing_articles_complete.csv", # Grandparent directory "/app/golf_swing_articles_complete.csv", # Absolute path for Hugging Face "/tmp/golf_swing_articles_complete.csv", # Alternative location os.path.join(os.path.dirname(__file__), "..", "golf_swing_articles_complete.csv"), # Relative to script ] csv_file_path = None for path in possible_paths: print(f"Checking for CSV at: {path}") if os.path.exists(path): csv_file_path = path print(f"✓ Found CSV at: {path}") break else: print(f"✗ Not found at: {path}") if csv_file_path is None: print("✗ CSV file not found in any expected location!") print(f"Files in current directory: {os.listdir('.')}") if os.path.exists(".."): print(f"Files in parent directory: {os.listdir('..')}") raise FileNotFoundError("golf_swing_articles_complete.csv not found in any expected location") print(f"Using CSV file: {csv_file_path}") self.csv_file_path = csv_file_path print("Initializing SentenceTransformer...") self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2') print("✓ SentenceTransformer initialized") self.index = None self.chunks = [] self.metadata = [] self.openai_client = None # Initialize OpenAI client - prioritize environment variables for Hugging Face deployment print("Initializing OpenAI client...") openai_key = None # Try multiple methods to get the OpenAI API key # Method 1: Environment variable (primary for HF Spaces) try: openai_key = os.getenv("OPENAI_API_KEY", "") if openai_key: print("✓ Found OpenAI key in environment variable OPENAI_API_KEY") else: print("No key found in environment variable OPENAI_API_KEY") except Exception as e: print(f"Error accessing environment variable OPENAI_API_KEY: {e}") # Method 2: Direct access to OPENAI_API_KEY secret (fallback for Streamlit) if not openai_key: try: openai_key = st.secrets.get("OPENAI_API_KEY", "") if openai_key: print("✓ Found OpenAI key in st.secrets['OPENAI_API_KEY']") else: print("No key found in st.secrets['OPENAI_API_KEY']") except Exception as e: print(f"Error accessing st.secrets['OPENAI_API_KEY']: {e}") # Method 3: Try nested openai structure (fallback) if not openai_key: try: openai_key = st.secrets.get("openai", {}).get("api_key", "") if openai_key: print("✓ Found OpenAI key in nested st.secrets['openai']['api_key']") else: print("No key found in st.secrets['openai']['api_key']") except Exception as e: print(f"Error accessing nested openai secrets: {e}") # Initialize client if we found a key if openai_key and openai_key.startswith("sk-"): try: # Simple initialization without extra parameters that might cause conflicts self.openai_client = openai.OpenAI(api_key=openai_key) print("✓ OpenAI client initialized 
successfully") # Test the client with a simple request to verify it works try: # Make a minimal test request test_response = self.openai_client.chat.completions.create( model="gpt-4o-mini", messages=[{"role": "user", "content": "Hi"}], max_tokens=5 ) print("✓ OpenAI client test successful") except Exception as test_e: print(f"⚠️ OpenAI client test failed: {test_e}") # Try with a different model if gpt-4o-mini fails try: test_response = self.openai_client.chat.completions.create( model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi"}], max_tokens=5 ) print("✓ OpenAI client test successful with gpt-3.5-turbo") except Exception as test_e2: print(f"⚠️ OpenAI client test failed with both models: {test_e2}") self.openai_client = None except Exception as e: print(f"✗ Error initializing OpenAI client: {e}") print(f"Error type: {type(e).__name__}") # Try alternative initialization approach try: print("Trying alternative OpenAI client initialization...") # Import OpenAI directly to avoid potential conflicts from openai import OpenAI self.openai_client = OpenAI(api_key=openai_key) print("✓ Alternative OpenAI client initialization successful") # Test the alternative client try: test_response = self.openai_client.chat.completions.create( model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Test"}], max_tokens=5 ) print("✓ Alternative OpenAI client test successful") except Exception as alt_test_e: print(f"⚠️ Alternative OpenAI client test failed: {alt_test_e}") self.openai_client = None except Exception as alt_e: print(f"✗ Alternative OpenAI client initialization also failed: {alt_e}") self.openai_client = None else: print("✗ No valid OpenAI API key found (should start with 'sk-')") if openai_key: print(f"Found key starting with: {openai_key[:10]}...") self.openai_client = None print("=== End GolfSwingRAG Initialization Debug ===") print("") def load_and_process_data(self): """Load CSV data and process it for RAG""" print("Loading golf swing data...") # Read CSV file df = pd.read_csv(self.csv_file_path) print(f"Loaded {len(df)} articles") # Process each article all_chunks = [] all_metadata = [] for idx, row in df.iterrows(): # Parse text chunks if they exist text_chunks = [] if pd.notna(row['text_chunks']) and row['text_chunks'].strip(): try: # Parse the text_chunks column (it appears to be a list in string format) chunks_str = row['text_chunks'] if chunks_str.startswith('[') and chunks_str.endswith(']'): # Remove brackets and split by quotes chunks_str = chunks_str[1:-1] # Remove outer brackets # Split by quote patterns while preserving content text_chunks = [chunk.strip().strip("'\"") for chunk in chunks_str.split("', '") if chunk.strip()] if not text_chunks and chunks_str: text_chunks = [chunks_str.strip().strip("'\"")] except: # Fallback: use cleaned_text if text_chunks parsing fails text_chunks = [row['cleaned_text']] if pd.notna(row['cleaned_text']) else [] # If no chunks, create chunks from cleaned_text or text if not text_chunks: text_content = row['cleaned_text'] if pd.notna(row['cleaned_text']) else row['text'] if pd.notna(text_content): # Split into chunks of ~500 words words = text_content.split() chunk_size = 500 text_chunks = [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)] # Add each chunk with metadata for chunk_idx, chunk in enumerate(text_chunks): if chunk and len(chunk.strip()) > 50: # Only process substantial chunks all_chunks.append(chunk) all_metadata.append({ 'title': row['title'], 'url': row['url'], 'source': row['source'], 
                        'publish_date': row['publish_date'],
                        'authors': row['authors'],
                        'chunk_index': chunk_idx,
                        'article_index': idx
                    })

        self.chunks = all_chunks
        self.metadata = all_metadata
        print(f"Created {len(all_chunks)} text chunks")

    def create_embeddings(self, force_recreate: bool = False):
        """Create embeddings for all text chunks, with enhanced error handling."""
        try:
            # Determine the correct base directory for the embeddings files
            if os.path.exists("golf_swing_articles_complete.csv"):
                # Running from the project root
                embeddings_file = "golf_swing_embeddings.pkl"
                index_file = "golf_swing_index.faiss"
            else:
                # Running from the app directory
                embeddings_file = "../golf_swing_embeddings.pkl"
                index_file = "../golf_swing_index.faiss"

            if not force_recreate and os.path.exists(embeddings_file) and os.path.exists(index_file):
                print("Loading existing embeddings...")
                try:
                    with open(embeddings_file, 'rb') as f:
                        data = pickle.load(f)
                    self.chunks = data['chunks']
                    self.metadata = data['metadata']
                    self.index = faiss.read_index(index_file)
                    print(f"Loaded {len(self.chunks)} chunks with embeddings")
                    return
                except Exception as e:
                    print(f"Failed to load existing embeddings: {e}")
                    print("Will create new embeddings...")

            print("Creating embeddings...")
            if not self.chunks:
                self.load_and_process_data()

            # Small batches keep peak memory low on constrained hosts
            batch_size = 16  # reduced from 32
            all_embeddings = []

            import gc  # used to release memory between batches

            for i in range(0, len(self.chunks), batch_size):
                try:
                    batch_chunks = self.chunks[i:i+batch_size]
                    print(f"Processing batch {i//batch_size + 1}/{(len(self.chunks) + batch_size - 1)//batch_size}")

                    # Encode the batch; embeddings are normalized here so that
                    # L2 distance on the index behaves like cosine similarity
                    batch_embeddings = self.embedding_model.encode(
                        batch_chunks,
                        show_progress_bar=True,
                        convert_to_numpy=True,
                        normalize_embeddings=True
                    )
                    all_embeddings.append(batch_embeddings)
                    print(f"Processed {min(i+batch_size, len(self.chunks))}/{len(self.chunks)} chunks")

                    # Force garbage collection after each batch
                    gc.collect()
                except Exception as e:
                    print(f"Error processing batch {i//batch_size + 1}: {e}")
                    # Continue with the next batch instead of failing completely
                    continue

            if not all_embeddings:
                raise RuntimeError("Failed to create any embeddings")

            # Combine the per-batch embeddings into one matrix
            print("Combining embeddings...")
            embeddings = np.vstack(all_embeddings)

            # Create the FAISS index; a flat L2 index is the simplest and most
            # stable choice (L2 distance instead of inner product)
            print("Creating FAISS index...")
            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(embeddings.astype('float32'))

            # Persist the embeddings and index for faster restarts
            print("Saving embeddings...")
            try:
                with open(embeddings_file, 'wb') as f:
                    pickle.dump({
                        'chunks': self.chunks,
                        'metadata': self.metadata
                    }, f)
                faiss.write_index(self.index, index_file)
                print(f"Created and saved embeddings for {len(self.chunks)} chunks")
            except Exception as e:
                print(f"Failed to save embeddings: {e}")
                print("Embeddings created but not saved to disk")
        except Exception as e:
            print(f"Critical error in create_embeddings: {e}")
            print("RAG system will operate in limited mode")
            # Set up a minimal fallback state
            self.chunks = self.chunks if hasattr(self, 'chunks') and self.chunks else []
            self.metadata = self.metadata if hasattr(self, 'metadata') and self.metadata else []
            self.index = None

    def search_similar_chunks(self, query: str, top_k: int = 5) -> List[Dict]:
        """Search for similar chunks using semantic similarity, with a fallback."""
        try:
            if self.index is None:
                print("FAISS index not available, using simple text matching fallback")
                return self._fallback_search(query, top_k)

            # Embed the query with the same normalization as the corpus
            query_embedding = self.embedding_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)

            # Search the FAISS index (L2 distance, so lower scores are better)
            scores, indices = self.index.search(query_embedding.astype('float32'), top_k)

            # Convert the results to a list of dicts
            results = []
            for score, idx in zip(scores[0], indices[0]):
                if 0 <= idx < len(self.chunks):  # FAISS returns -1 for empty result slots
                    results.append({
                        'chunk': self.chunks[idx],
                        'metadata': self.metadata[idx],
                        'similarity_score': 1.0 / (1.0 + score)  # convert L2 distance to a similarity
                    })
            return results
        except Exception as e:
            print(f"Error in semantic search: {e}")
            print("Falling back to simple text matching")
            return self._fallback_search(query, top_k)

    def _fallback_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Simple text-based search fallback for when semantic search fails."""
        if not self.chunks:
            return []

        query_lower = query.lower()
        query_words = set(query_lower.split())

        # Score chunks by simple word overlap with the query
        scored_chunks = []
        for i, chunk in enumerate(self.chunks):
            chunk_words = set(chunk.lower().split())
            overlap = len(query_words.intersection(chunk_words))
            if overlap > 0:
                score = overlap / len(query_words)
                scored_chunks.append({
                    'chunk': chunk,
                    'metadata': self.metadata[i] if i < len(self.metadata) else {},
                    'similarity_score': score
                })

        # Sort by score and return the top_k matches
        scored_chunks.sort(key=lambda x: x['similarity_score'], reverse=True)
        return scored_chunks[:top_k]

    def generate_response(self, query: str, context_chunks: List[Dict]) -> str:
        """Generate a response using the OpenAI API with retrieved context."""
        if not self.openai_client:
            return self._generate_fallback_response(query, context_chunks)

        # Assemble the retrieved chunks into a single context block
        context = "\n\n".join([f"Source: {chunk['metadata']['title']}\nContent: {chunk['chunk']}"
                               for chunk in context_chunks])

        # System prompt; {context} is filled in via str.format below
        system_prompt = """You are a golf swing technique expert assistant. You help golfers improve their swing by providing detailed, accurate advice based on professional golf instruction content.

Instructions:
- Answer questions about golf swing technique, mechanics, common problems, and solutions
- Provide specific, actionable advice when possible
- Reference relevant technical concepts when appropriate
- Be encouraging and supportive
- If asked about physical limitations or injuries, recommend consulting with a TPI certified professional
- Always base your answers on the provided context from golf instruction materials

Context from golf instruction database:
{context}"""

        user_prompt = f"""Based on the golf instruction content provided, please answer this question about golf swing technique:

Question: {query}

Please provide a helpful, detailed response that addresses the specific question while drawing from the relevant information in the context."""

        try:
            response = self.openai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt.format(context=context)},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=1000,
                temperature=0.7
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"OpenAI API error: {e}")
            return self._generate_fallback_response(query, context_chunks)

    def _generate_fallback_response(self, query: str, context_chunks: List[Dict]) -> str:
        """Generate a fallback response when the OpenAI API is not available."""
        if not context_chunks:
            return ("I couldn't find specific information about that topic in the golf swing database. "
                    "Could you try rephrasing your question or being more specific?")

        # Build a simple response from the most relevant chunk
        best_chunk = context_chunks[0]
        chunk_content = best_chunk['chunk']
        title = best_chunk['metadata']['title']

        response = f"Based on the article '{title}', here's what I found:\n\n"
        response += chunk_content[:500] + "..."
        response += f"\n\nFor more detailed information, you can refer to the full article: {title}"
        return response

    def query(self, question: str, top_k: int = 5) -> Dict:
        """Main query method; returns both the response and its sources."""
        # Retrieve the most relevant chunks
        relevant_chunks = self.search_similar_chunks(question, top_k)

        # Generate a response grounded in those chunks
        response = self.generate_response(question, relevant_chunks)

        return {
            'response': response,
            'sources': relevant_chunks,
            'query': question,
            'timestamp': datetime.now().isoformat()
        }


def main():
    """Initialize and smoke-test the RAG system."""
    rag = GolfSwingRAG()
    rag.load_and_process_data()
    rag.create_embeddings()

    # Test query
    test_query = "What wrist motion happens during the downswing?"
    result = rag.query(test_query)

    print(f"Query: {result['query']}")
    print(f"Response: {result['response']}")
    print(f"Number of sources: {len(result['sources'])}")


if __name__ == "__main__":
    main()