# Spaces: Runtime error
import os
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
from datasets import load_dataset
class BhagavadGitaDataLoader:
    """Download, cache and query the Bhagavad Gita verse dataset.

    The dataset is fetched from the HuggingFace hub once, normalised into a
    ``pandas.DataFrame`` and pickled under ``cache_dir`` so later runs can skip
    the download. All query methods load the dataset lazily on first use.
    """

    _DATASET_NAME = "JDhruv14/Bhagavad-Gita_Dataset"

    # Upstream column name -> column name used throughout this class.
    _COLUMN_RENAMES = {
        'chapter': 'chapter_num',
        'verse': 'verse_num',
        'sanskrit': 'sanskrit_text',
        'hindi': 'hindi_text',
        'english': 'english_text',
    }

    # Columns the derived fields and the query methods actually read.
    _REQUIRED_COLUMNS = (
        'chapter_num',
        'verse_num',
        'sanskrit_text',
        'english_text',
    )

    def __init__(self, cache_dir: str = "cache"):
        """Create the loader and make sure the cache directory exists.

        Args:
            cache_dir: Directory that holds the pickled dataset. Missing
                parent directories are created as well.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested location such as "var/cache/gita" works.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.data_cache_file = self.cache_dir / "bhagavad_gita_data.pkl"
        self.dataset: Optional[pd.DataFrame] = None

    def load_dataset(self, force_refresh: bool = False) -> pd.DataFrame:
        """Return the verse table, reading the cache or downloading it.

        Args:
            force_refresh: When true, ignore any cache file and download again.

        Returns:
            The normalised DataFrame, which is also stored on ``self.dataset``.

        Raises:
            KeyError: If the downloaded data lacks a required column.
        """
        if not force_refresh and self.data_cache_file.exists():
            print("Loading cached dataset...")
            cached = self._read_cache()
            if cached is not None:
                self.dataset = cached
                return cached
            # Unreadable cache: fall through and rebuild it from the hub.

        print("Downloading dataset from HuggingFace...")
        dataset = load_dataset(self._DATASET_NAME)
        df = self._prepare_frame(pd.DataFrame(dataset['train']))
        self._write_cache(df)
        self.dataset = df
        return df

    def _read_cache(self) -> Optional[pd.DataFrame]:
        """Return the cached DataFrame, or ``None`` if the cache is unusable."""
        try:
            with open(self.data_cache_file, 'rb') as f:
                # NOTE(security): unpickling runs arbitrary code, so the cache
                # directory must only be writable by trusted users.
                cached = pickle.load(f)
        except (pickle.UnpicklingError, EOFError, AttributeError,
                ImportError, IndexError) as exc:
            print(f"Cached dataset is unreadable ({exc}); refreshing...")
            return None
        # Anything other than a DataFrame means the file is not our cache.
        return cached if isinstance(cached, pd.DataFrame) else None

    def _write_cache(self, df: pd.DataFrame) -> None:
        """Pickle ``df`` to the cache file without leaving a partial file."""
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        tmp_file = self.data_cache_file.with_name(
            self.data_cache_file.name + '.tmp'
        )
        with open(tmp_file, 'wb') as f:
            pickle.dump(df, f)
        # Swap into place only after the dump succeeded, so an interrupted
        # write cannot leave a truncated pickle behind.
        os.replace(tmp_file, self.data_cache_file)

    @classmethod
    def _prepare_frame(cls, df: pd.DataFrame) -> pd.DataFrame:
        """Rename upstream columns and add ``verse_id`` / ``combined_text``.

        Raises:
            KeyError: If a required column is absent after renaming.
        """
        df = df.rename(columns=cls._COLUMN_RENAMES)
        missing = [col for col in cls._REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise KeyError(f"Dataset is missing required columns: {missing}")

        df['verse_id'] = (
            df['chapter_num'].astype(str) + '.' + df['verse_num'].astype(str)
        )
        # Nulls are treated as empty text so one missing translation does not
        # turn the whole combined field into NaN.
        df['combined_text'] = (
            df['english_text'].fillna('') + ' ' + df['sanskrit_text'].fillna('')
        )
        return df

    def _ensure_loaded(self) -> pd.DataFrame:
        """Return the dataset, loading it first if that has not happened yet."""
        if self.dataset is None:
            self.load_dataset()
        return self.dataset

    def get_verse_by_id(self, verse_id: str) -> Optional[Dict[str, Any]]:
        """Look up one verse by its ``"<chapter>.<verse>"`` identifier.

        Args:
            verse_id: Identifier such as ``"2.47"``.

        Returns:
            The first matching row as a dict, or ``None`` when nothing matches.
        """
        data = self._ensure_loaded()
        verse_row = data[data['verse_id'] == verse_id]
        if verse_row.empty:
            return None
        return verse_row.iloc[0].to_dict()

    def get_verses_by_chapter(self, chapter_num: int) -> List[Dict[str, Any]]:
        """Return every verse of a chapter as a list of row dicts.

        Args:
            chapter_num: Value compared for equality against ``chapter_num``.

        Returns:
            Matching rows in dataset order; an empty list if there are none.
        """
        data = self._ensure_loaded()
        chapter_verses = data[data['chapter_num'] == chapter_num]
        return chapter_verses.to_dict('records')

    def search_verses(self, query_text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Rank verses by how many query words occur in the English text.

        Matching is a case-insensitive substring test: each whitespace-separated
        query word scores one point if it appears anywhere in a verse's
        ``english_text``. Verses with no hit are left out, and ties keep
        dataset order.

        Args:
            query_text: Free-text query. Empty or blank text matches nothing.
            top_k: Maximum number of verses to return. Zero or a negative
                value returns no verses.

        Returns:
            Up to ``top_k`` row dicts, best score first.
        """
        terms = query_text.lower().split() if query_text else []
        if not terms or (top_k is not None and top_k <= 0):
            return []

        data = self._ensure_loaded()
        scored = []
        for position, text in enumerate(data['english_text']):
            if not isinstance(text, str):
                # Null or non-text cell: nothing to search in.
                continue
            lowered = text.lower()
            score = sum(1 for term in terms if term in lowered)
            if score:
                scored.append((score, position))

        # The sort is stable, so equal scores stay in dataset order.
        scored.sort(key=lambda item: item[0], reverse=True)
        # Only the rows actually returned are converted to dicts.
        return [data.iloc[position].to_dict() for _, position in scored[:top_k]]

    def get_all_verses(self) -> pd.DataFrame:
        """Return the full verse table, loading it on first use."""
        return self._ensure_loaded()