Spaces:
Sleeping
Sleeping
"""Convert a JSONL scientific corpus to Arrow and upload it to the HF Hub."""

import logging
import os
import sys
from datetime import datetime
from pathlib import Path

from datasets import Dataset, Features, Value
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Pull HF_TOKEN (and any other settings) from a local .env file.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Mirror all log output to stdout and to a debug file (truncated each run).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('debug_upload.log', mode='w'),
    ],
)

# Target dataset repo on the Hub and the local working paths.
REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")
def debug_jsonl_head(jsonl_path, n=5):
    """Log the first *n* raw lines of a JSONL file for schema inspection.

    Read errors are logged rather than raised; the function always
    returns None.
    """
    logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:")
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            # enumerate stops naturally at EOF; the guard caps it at n lines.
            for idx, line in enumerate(f, start=1):
                if idx > n:
                    break
                logging.info(f"Line {idx}: {line.strip()}")
    except Exception as e:
        logging.error(f"Failed to read JSONL head: {e}")
def infer_features_from_sample(jsonl_path, n=100):
    """Infer per-field Python type names from the first *n* JSONL records.

    Reads up to *n* lines, JSON-decodes each one, and accumulates the set
    of value type names observed for every top-level key.  The mapping is
    logged and — unlike before, where it was only logged — returned so
    callers can build an explicit `Features` schema from it.

    Args:
        jsonl_path: Path to the JSONL file.
        n: Maximum number of lines to sample.

    Returns:
        dict mapping field name -> set of type-name strings, or None when
        the file cannot be read (the error is logged, not raised).
    """
    import json
    from collections import defaultdict

    types = defaultdict(set)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= n:
                    break
                obj = json.loads(line)
                for k, v in obj.items():
                    types[k].add(type(v).__name__)
        logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
        return dict(types)
    except Exception as e:
        logging.error(f"Failed to infer features: {e}")
        return None
def convert_jsonl_to_arrow(jsonl_path, arrow_path):
    """Load the JSONL corpus into a `datasets.Dataset` and persist it as Arrow.

    Emits debug previews (raw head + inferred field types), loads a small
    sample first so schema problems surface early, then loads the full file
    and writes it to *arrow_path*.  Every failure is logged (with traceback)
    to debug_upload.log and re-raised to the caller.
    """
    try:
        logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
        # Fail fast when the source file is missing.
        if not jsonl_path.exists():
            logging.error(f"JSONL source file does not exist: {jsonl_path}")
            print(f"\nβ JSONL source file does not exist: {jsonl_path}")
            raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
        logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
        debug_jsonl_head(jsonl_path, n=5)
        infer_features_from_sample(jsonl_path, n=100)
        # Try loading a small sample first for debugging.
        # NOTE(review): it is unclear that `Dataset.from_json` honors slice
        # syntax in `split` — confirm against the pinned `datasets` version.
        try:
            sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]")
            logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
        except Exception as sample_e:
            logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
            print(f"\nβ Failed to load sample from JSONL. See debug_upload.log for details.")
            # Try to load with explicit features if possible.
            # Example: features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # Uncomment and adjust the following lines if you know the schema:
            # features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            # try:
            #     sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]", features=features)
            #     logging.info(f"Sample loaded with explicit features: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
            # except Exception as e2:
            #     logging.error(f"Still failed with explicit features: {e2}", exc_info=True)
            raise
        # Now load the full dataset.
        dataset = Dataset.from_json(str(jsonl_path))
        logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")
        # NOTE(review): `Dataset.to_file` is not part of the documented
        # `datasets` API in recent releases (`save_to_disk` / `to_parquet`
        # are) — verify this call works with the pinned library version.
        dataset.to_file(str(arrow_path))
        logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
        return dataset
    except Exception as e:
        logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
        print(f"\nβ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
        raise
def create_readme(dataset):
    """Write a minimal dataset card (README.md) describing *dataset*.

    The card records the row count, column names, and a timestamp, plus a
    copy-paste `load_dataset` usage snippet.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    content = f"""# Scientific Research Tokenized Dataset
- **Examples**: {len(dataset):,}
- **Columns**: {dataset.column_names}
- **Updated**: {stamp}
## Usage
```python
from datasets import load_dataset
ds = load_dataset("{REPO_ID}")
```
"""
    README_PATH.write_text(content, encoding="utf-8")
    logging.info("README.md created.")
def upload_to_hf():
    """Push the Arrow file and README.md to the Hugging Face dataset repo.

    Uses the module-level HF_TOKEN for authentication; each file is
    uploaded in its own commit.
    """
    api = HfApi()
    # (local path, path in repo, commit message, log line) per upload.
    uploads = (
        (ARROW_PATH, ARROW_PATH.name, "Upload Arrow dataset",
         "Uploading Arrow file to HuggingFace Hub ..."),
        (README_PATH, "README.md", "Update README",
         "Uploading README.md to HuggingFace Hub ..."),
    )
    for local_path, repo_path, message, notice in uploads:
        logging.info(notice)
        api.upload_file(
            path_or_fileobj=str(local_path),
            path_in_repo=repo_path,
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=message,
        )
    logging.info("Upload complete.")
def upload_to_huggingface(*args, **kwargs):
    """Compatibility alias for :func:`upload_to_hf` (name expected by Main_2.py)."""
    return upload_to_hf(*args, **kwargs)
def cleanup():
    """Delete the locally generated Arrow file and README, if present."""
    for artifact in (ARROW_PATH, README_PATH):
        if artifact.exists():
            artifact.unlink()
    logging.info("Cleaned up local files.")
def main():
    """Drive the full pipeline: convert, write README, upload, clean up.

    Exits with status 1 on any failure; local artifacts are always removed
    in the `finally` block, whether or not the upload succeeded.
    """
    try:
        # Refuse to run without Hub credentials.
        if not HF_TOKEN:
            print("β HF_TOKEN not found in environment. Please set it in your .env file.")
            return
        dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
        create_readme(dataset)
        upload_to_hf()
        print(f"\nπ SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        print(f"\nβ Upload failed. See debug_upload.log for details.")
        sys.exit(1)
    finally:
        cleanup()


if __name__ == "__main__":
    main()