# gita_krishna_bot / data_loader.py
# Kartheek Akella
# Initial Working Commit
# 9e4c237
import os
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
from datasets import load_dataset
class BhagavadGitaDataLoader:
    """Load, cache, and query a Bhagavad Gita verse dataset.

    The dataset is downloaded from the HuggingFace hub on first use,
    normalised into a pandas DataFrame, and pickled into ``cache_dir`` so
    later runs can skip the download. Query helpers load the data lazily
    if it has not been loaded yet.
    """

    def __init__(self, cache_dir: str = "cache"):
        """Prepare the cache directory; no data is loaded here.

        Args:
            cache_dir: Directory that holds the pickled DataFrame. It is
                created (including missing parents) if it does not exist.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache path such as "var/cache/gita" works
        # instead of raising FileNotFoundError.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.data_cache_file = self.cache_dir / "bhagavad_gita_data.pkl"
        # Populated lazily by load_dataset().
        self.dataset: Optional[pd.DataFrame] = None

    def load_dataset(self, force_refresh: bool = False) -> pd.DataFrame:
        """Return the verse DataFrame, from cache when possible.

        Args:
            force_refresh: When True, ignore any cached pickle and download
                the dataset again.

        Returns:
            The DataFrame, which is also stored on ``self.dataset``.
        """
        if not force_refresh and self.data_cache_file.exists():
            print("Loading cached dataset...")
            try:
                # SECURITY: pickle can execute arbitrary code on load. This is
                # acceptable only because the cache file is written by this
                # class; never point cache_dir at an untrusted location.
                with open(self.data_cache_file, 'rb') as f:
                    self.dataset = pickle.load(f)
            except (pickle.UnpicklingError, EOFError):
                # A truncated or corrupt cache (e.g. an interrupted write)
                # should not be fatal: fall through and rebuild it.
                print("Cached dataset is unreadable; downloading a fresh copy...")
            else:
                return self.dataset

        print("Downloading dataset from HuggingFace...")
        # Inside a method body the bare name resolves to the module-level
        # `load_dataset` imported from `datasets`, not to this method.
        dataset = load_dataset("JDhruv14/Bhagavad-Gita_Dataset")
        df = pd.DataFrame(dataset['train'])

        # NOTE(review): assumes the upstream split exposes these source column
        # names; rename() silently ignores any that are missing - confirm.
        df = df.rename(columns={
            'chapter': 'chapter_num',
            'verse': 'verse_num',
            'sanskrit': 'sanskrit_text',
            'hindi': 'hindi_text',
            'english': 'english_text',
        })

        df['verse_id'] = (
            df['chapter_num'].astype(str) + '.' + df['verse_num'].astype(str)
        )
        # fillna('') keeps one missing translation from turning the whole
        # combined string into NaN.
        df['combined_text'] = (
            df['english_text'].fillna('') + ' ' + df['sanskrit_text'].fillna('')
        )

        with open(self.data_cache_file, 'wb') as f:
            pickle.dump(df, f)

        self.dataset = df
        return df

    def get_verse_by_id(self, verse_id: str) -> Optional[Dict[str, Any]]:
        """Return the first verse whose ``verse_id`` equals ``verse_id``.

        Args:
            verse_id: Identifier in ``"<chapter>.<verse>"`` form, as built by
                ``load_dataset``.

        Returns:
            The matching row as a dict, or ``None`` when no row matches.
        """
        if self.dataset is None:
            self.load_dataset()

        verse_row = self.dataset[self.dataset['verse_id'] == verse_id]
        if verse_row.empty:
            return None
        return verse_row.iloc[0].to_dict()

    def get_verses_by_chapter(self, chapter_num: int) -> List[Dict[str, Any]]:
        """Return every verse whose ``chapter_num`` equals ``chapter_num``.

        Returns:
            A list of row dicts in DataFrame order; empty if none match.
        """
        if self.dataset is None:
            self.load_dataset()

        chapter_verses = self.dataset[self.dataset['chapter_num'] == chapter_num]
        return chapter_verses.to_dict('records')

    def search_verses(self, query_text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Rank verses by how many query words occur in their English text.

        Matching is a case-insensitive substring test per whitespace-separated
        query word. A verse's score is the number of query words found in it,
        and ties keep DataFrame order.

        Args:
            query_text: Free-text query.
            top_k: Maximum number of verses to return; values <= 0 yield an
                empty list.

        Returns:
            Up to ``top_k`` row dicts, best score first.
        """
        if self.dataset is None:
            self.load_dataset()

        # Split once instead of once (or twice) per row.
        terms = query_text.lower().split()
        if not terms or top_k <= 0:
            return []

        scored = []
        for position, text in enumerate(self.dataset['english_text']):
            # Skip rows with a missing translation (None/NaN) rather than
            # crashing on .lower().
            if not isinstance(text, str):
                continue
            lowered = text.lower()
            score = sum(1 for term in terms if term in lowered)
            if score:
                scored.append((score, position))

        # list.sort is stable, so equally scored verses keep dataset order.
        scored.sort(key=lambda item: item[0], reverse=True)
        # Convert only the rows that are actually returned.
        return [
            self.dataset.iloc[position].to_dict()
            for _, position in scored[:top_k]
        ]

    def get_all_verses(self) -> pd.DataFrame:
        """Return the full verse DataFrame, loading it first if needed.

        The returned object is the loader's own DataFrame, not a copy.
        """
        if self.dataset is None:
            self.load_dataset()
        return self.dataset