import random
from pathlib import Path
from typing import Dict, List, Optional, Tuple

def load_hindi_dataset(base_path: str = "data", split: str = "train",
                       num_files: Optional[int] = None) -> str:
    """
    Load Hindi text from a dataset with a train/validation split structure.

    Args:
        base_path: Base directory containing the train and validation folders
        split: Either 'train' or 'valid'
        num_files: Number of files to load (None for all files)
    """
    base_dir = Path(base_path)
    # The dataset nests each split inside a same-named folder, e.g. data/train/train
    split_dir = base_dir / split / split
    if not split_dir.exists():
        raise FileNotFoundError(f"Directory not found: {split_dir}")
    print(f"\nLoading Hindi dataset from {split_dir}")

    # Get all txt files in the directory
    txt_files = list(split_dir.glob("*.txt"))
    if not txt_files:
        raise FileNotFoundError(f"No txt files found in {split_dir}")

    # Sort files by word count (filenames encode the word count)
    txt_files.sort(key=lambda x: int(x.stem))

    # Sample a random subset if num_files is specified
    if num_files is not None and num_files < len(txt_files):
        txt_files = random.sample(txt_files, num_files)
    print(f"Found {len(txt_files)} files")

    # Load and combine text from files
    texts = []
    total_chars = 0
    total_words = 0
    for idx, file_path in enumerate(txt_files, 1):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
            word_count = int(file_path.stem)  # Filename is the word count
            texts.append(text)
            total_chars += len(text)
            total_words += word_count
            if idx % 10 == 0:
                print(f"Processed {idx}/{len(txt_files)} files. "
                      f"Total characters: {total_chars:,}, "
                      f"Total words: {total_words:,}")
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            continue

    # Guard against division by zero below if every file failed to read
    if not texts:
        raise ValueError(f"No files could be read from {split_dir}")

    combined_text = "\n\n".join(texts)
    print("\nDataset loading completed:")
    print(f"Total files: {len(texts)}")
    print(f"Total characters: {len(combined_text):,}")
    print(f"Total words: {total_words:,}")
    print(f"Average words per file: {total_words / len(texts):,.1f}")
    return combined_text
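
# Note: random.sample above makes the file subset vary between runs. A minimal,
# hedged sketch of a reproducible wrapper (the helper name and default seed are
# illustrative assumptions, not part of the original module):
def load_hindi_dataset_seeded(base_path: str = "data", split: str = "train",
                              num_files: Optional[int] = None,
                              seed: int = 42) -> str:
    """Seed the module-level RNG so sampled file subsets are reproducible."""
    random.seed(seed)
    return load_hindi_dataset(base_path, split, num_files)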

def get_dataset_stats(base_path: str = "data") -> Dict:
    """Get statistics about the dataset."""
    stats = {}
    for split in ['train', 'valid']:
        # Use the same nested split/split layout as load_hindi_dataset
        split_dir = Path(base_path) / split / split
        if split_dir.exists():
            txt_files = list(split_dir.glob("*.txt"))
            word_counts = [int(f.stem) for f in txt_files]
            stats[split] = {
                'num_files': len(txt_files),
                'total_words': sum(word_counts),
                'min_words': min(word_counts) if word_counts else 0,
                'max_words': max(word_counts) if word_counts else 0,
                'avg_words': sum(word_counts) / len(word_counts) if word_counts else 0
            }
    return stats
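
# Example return shape from get_dataset_stats (keys match the code above;
# the numeric values here are purely illustrative):
# {
#     'train': {'num_files': 100, 'total_words': 250000, 'min_words': 500,
#               'max_words': 5000, 'avg_words': 2500.0},
#     'valid': {'num_files': 20, 'total_words': 50000, 'min_words': 500,
#               'max_words': 5000, 'avg_words': 2500.0}
# }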

def load_train_valid_split(base_path: str = "data",
                           train_files: Optional[int] = None,
                           valid_files: Optional[int] = None) -> Tuple[str, str]:
    """Load both the train and validation splits."""
    train_text = load_hindi_dataset(base_path, "train", train_files)
    valid_text = load_hindi_dataset(base_path, "valid", valid_files)
    return train_text, valid_text
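
# Because load_hindi_dataset joins files with "\n\n", downstream code can
# recover per-file documents from the combined string. A minimal sketch (the
# helper name is an assumption; it over-splits if files contain blank lines):
def split_into_documents(combined_text: str) -> List[str]:
    """Split the combined corpus back into non-empty per-file documents."""
    return [doc for doc in combined_text.split("\n\n") if doc.strip()]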
| if __name__ == "__main__": | |
| # Print dataset statistics | |
| stats = get_dataset_stats() | |
| print("\nDataset Statistics:") | |
| print("-" * 50) | |
| for split, split_stats in stats.items(): | |
| print(f"\n{split.upper()} Split:") | |
| for key, value in split_stats.items(): | |
| if isinstance(value, (int, float)): | |
| print(f"{key}: {value:,}") | |
| else: | |
| print(f"{key}: {value}") | |
| # Load sample data | |
| print("\nLoading sample data...") | |
| train_text, valid_text = load_train_valid_split(train_files=5, valid_files=2) | |
| print(f"\nSample train text (first 200 chars):\n{train_text[:200]}") | |
| print(f"\nSample valid text (first 200 chars):\n{valid_text[:200]}") |