"""
ORD Reagent Index Builder - Gradio App (Simplified)
Runs directly on Hugging Face Spaces
"""

import os
import sys
import traceback
from collections import defaultdict
from itertools import chain, islice

import gradio as gr
from datasets import load_dataset, Dataset
from huggingface_hub import login

# Check for HF_TOKEN
HF_TOKEN = os.getenv("HF_TOKEN")

# Configuration
ORIGINAL_DATASET = "smitathkr1/ord-reactions"
HF_DATASET_NAME = "smitathkr1/ord-reagent-index"
SAMPLE_SIZE = None  # Set to 100 for testing


def _collect_smiles_index(reactions, sample_size=None):
    """Map each normalized SMILES string to the reaction ids that mention it.

    Args:
        reactions: Iterable of dict-like reaction records. Each record is read
            through ``.get`` for the keys ``reaction_id``, ``inputs_smiles``
            and ``products_smiles``.
        sample_size: Optional cap on the number of reactions to consume. Any
            falsy value (``None`` or ``0``) means "no limit".

    Returns:
        A ``(smiles_to_reactions, processed)`` tuple: the mapping from
        normalized SMILES to the list of reaction ids (duplicates possible),
        and the number of reactions actually consumed.
    """
    # islice stops after exactly `sample_size` records, so the returned count
    # is the number of reactions indexed (no off-by-one on the limit).
    limited = islice(reactions, sample_size) if sample_size else reactions

    smiles_to_reactions = defaultdict(list)
    processed = 0
    for processed, reaction in enumerate(limited, start=1):
        if processed % 100000 == 0:
            print(f" [{processed:,}] reactions processed...")

        reaction_id = reaction.get('reaction_id')

        # `or ()` guards against keys that are present but hold None, which
        # the `.get(key, [])` default does not cover.
        all_smiles = chain(
            reaction.get('inputs_smiles') or (),
            reaction.get('products_smiles') or (),
        )
        for smiles in all_smiles:
            if isinstance(smiles, str) and smiles.strip():
                # NOTE(review): lower-casing SMILES conflates aromatic and
                # aliphatic atoms (e.g. 'c' vs 'C'). Kept as-is because
                # consumers of the published index presumably lower-case
                # their queries the same way - confirm before changing.
                smiles_lower = smiles.lower().strip()
                smiles_to_reactions[smiles_lower].append(reaction_id)

    return smiles_to_reactions, processed


def _build_index_entries(smiles_to_reactions):
    """Turn the SMILES -> reaction ids mapping into a list of index rows.

    Each row has the keys ``search_term``, ``search_type`` (always
    ``'smiles'``), ``reaction_ids`` (de-duplicated) and ``count``.
    """
    index_entries = []
    for smiles, reaction_ids in smiles_to_reactions.items():
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # uploaded dataset is reproducible between runs (set() order is not).
        unique_ids = list(dict.fromkeys(reaction_ids))
        index_entries.append({
            'search_term': smiles,
            'search_type': 'smiles',
            'reaction_ids': unique_ids,
            'count': len(unique_ids),
        })
    return index_entries


def _sample_rows(index_entries, limit=10):
    """Return the first ``limit`` entries as ``[term, type, count]`` rows."""
    return [
        [entry['search_term'][:30], entry['search_type'], entry['count']]
        for entry in index_entries[:limit]
    ]


def build_index():
    """Build the reagent index and upload it to the Hugging Face Hub.

    Streams ``ORIGINAL_DATASET``, indexes every input/product SMILES string
    by reaction id, and pushes the result to ``HF_DATASET_NAME``. Progress is
    reported through ``print``.

    Returns:
        A ``(status_message, sample_rows)`` tuple for the Gradio outputs.
        ``sample_rows`` holds up to 10 ``[search_term, search_type, count]``
        rows. On failure (including a missing ``HF_TOKEN``) the message
        starts with ``"ERROR:"`` and ``sample_rows`` is an empty list; no
        exception propagates to the caller.
    """
    if not HF_TOKEN:
        return "ERROR: HF_TOKEN not found. Add it to Space secrets.", []

    try:
        print("\n" + "=" * 70)
        print("Starting ORD Reagent Index Builder")
        print("=" * 70 + "\n")

        # 1. Auth
        print("[1/5] Authenticating with Hugging Face...")
        login(token=HF_TOKEN)
        print(" OK - Authenticated\n")

        # 2. Load dataset
        print("[2/5] Loading dataset in streaming mode...")
        ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
        print(" OK - Dataset loaded\n")

        # 3. Process reactions
        print("[3/5] Processing reactions...")
        print(" This will take 10-20 minutes, please wait...\n")
        smiles_to_reactions, processed = _collect_smiles_index(ds, SAMPLE_SIZE)
        print(f"\n Total: {processed:,} reactions processed\n")

        # 4. Build index
        print("[4/5] Building index...")
        index_entries = _build_index_entries(smiles_to_reactions)
        print(f" Created {len(index_entries):,} index entries\n")

        # 5. Upload
        print("[5/5] Uploading to Hugging Face...")
        index_dataset = Dataset.from_list(index_entries)
        index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
        print(" OK - Upload complete\n")

        # Summary
        print("=" * 70)
        print("SUCCESS!")
        print("=" * 70)
        print(f"Total reactions: {processed:,}")
        print(f"Index entries: {len(index_entries):,}")
        print(f"Dataset: https://huggingface.co/datasets/{HF_DATASET_NAME}\n")

        return "✅ Index built successfully! Check logs above.", _sample_rows(index_entries)

    except Exception as e:
        # Top-level UI boundary: report the failure in the status box instead
        # of letting the Gradio callback crash; full traceback goes to logs.
        error_msg = f"ERROR: {str(e)}\n\nDetails:\n{type(e).__name__}"
        print(f"\n{error_msg}\n")
        traceback.print_exc()
        return error_msg, []


# Create Gradio interface
with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧪 ORD Reagent Index Builder")
    gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face Spaces")

    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ### About
            This tool creates a fast search index for the Open Reaction Database.

            **Features:**
            - Streams 2.7M reactions (no memory issues)
            - SMILES indexing
            - Auto-uploads to Hugging Face

            **Time:** ~10-20 minutes
            """)

        with gr.Column():
            gr.Markdown("""
            ### Links
            [ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)

            [Index Dataset](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
            """)

    gr.Markdown("---")

    # Main button
    start_btn = gr.Button("🚀 Start Building Index", size="lg", variant="primary")

    gr.Markdown("---")

    # Output
    gr.Markdown("### Output")
    status_output = gr.Textbox(
        label="Status",
        lines=3,
        interactive=False,
        placeholder="Click start button...",
    )

    gr.Markdown("### Sample Index Entries (First 10)")
    table_output = gr.Dataframe(
        headers=["Search Term", "Type", "Count"],
        interactive=False,
    )

    # Event handler
    start_btn.click(
        fn=build_index,
        outputs=[status_output, table_output],
    )


if __name__ == "__main__":
    demo.launch(share=False)