Spaces:

smitathkr1
/

ord-reagent-index-python

Paused

App Files Files Community

testtest123 committed on Nov 1, 2025

Commit

2d50652

1 Parent(s): e217ee7

Simplify: Remove yield/generator - use print statements instead

Browse files

Files changed (1) hide show

app.py +61 -145

app.py CHANGED Viewed

@@ -1,15 +1,14 @@
 """
-ORD Reagent Index Builder - Gradio App
-Runs directly on Hugging Face Spaces (Python SDK, no Docker needed)
 """
 import gradio as gr
 import os
 from collections import defaultdict
-from pathlib import Path
 from datasets import load_dataset, Dataset
 from huggingface_hub import login
-import sys
 # Check for HF_TOKEN
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -19,39 +18,30 @@ ORIGINAL_DATASET = "smitathkr1/ord-reactions"
 HF_DATASET_NAME = "smitathkr1/ord-reagent-index"
 SAMPLE_SIZE = None  # Set to 100 for testing
-def build_reagent_index(progress=gr.Progress()):
-    """Main function to build the reagent index."""
     if not HF_TOKEN:
-        yield "❌ Error: HF_TOKEN not found! Please add it to Space secrets.", [], 0
     try:
-        log_messages = []
-        progress(0, desc="Authenticating...")
-        log_messages.append("[*] Authenticating with Hugging Face...")
-        yield "\n".join(log_messages), [], 0.0
-        # 1. Authentication
         login(token=HF_TOKEN)
-        progress(0.05, desc="Authenticated successfully")
-        log_messages.append("[OK] Authenticated successfully\n")
-        yield "\n".join(log_messages), [], 0.05
         # 2. Load dataset
-        progress(0.1, desc="Loading dataset...")
-        log_messages.append("[*] Loading dataset in streaming mode...")
-        yield "\n".join(log_messages), [], 0.1
         ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
-        log_messages.append("[OK] Dataset loaded\n")
-        yield "\n".join(log_messages), [], 0.1
         # 3. Process reactions
-        progress(0.15, desc="Processing reactions...")
-        log_messages.append("[*] Processing 2.7M reactions...")
-        log_messages.append("This will take 10-20 minutes, please be patient...\n")
-        yield "\n".join(log_messages), [], 0.15
         smiles_to_reactions = defaultdict(list)
         name_to_reactions = defaultdict(list)
@@ -60,15 +50,10 @@ def build_reagent_index(progress=gr.Progress()):
         try:
             import pubchempy as pcp
             PUBCHEM_AVAILABLE = True
-            log_messages.append("[OK] PubChem available for chemical name lookup\n")
         except ImportError:
             PUBCHEM_AVAILABLE = False
-            log_messages.append("[⚠] PubChem not available - using SMILES only\n")
-        yield "\n".join(log_messages), [], 0.15
         processed = 0
-        last_logged = 0
         for reaction in ds:
             processed += 1
@@ -76,106 +61,49 @@ def build_reagent_index(progress=gr.Progress()):
             if SAMPLE_SIZE and processed > SAMPLE_SIZE:
                 break
-            # Update progress every 10,000 reactions (less frequent for better performance)
-            if processed - last_logged >= 10000:
-                pct = min(0.6, (processed / 2700000) * 0.5 + 0.15)
-                progress(pct, desc=f"Processing: {processed:,} reactions...")
-                log_messages.append(f"[{processed:,}] Processed {processed:,} reactions...")
-                yield "\n".join(log_messages), [], pct
-                last_logged = processed
-            reaction_id = reaction.get('reaction_id', 'unknown')
-            # Extract inputs (reactants)
-            inputs = reaction.get('inputs_smiles', [])
-            if inputs:
-                for smiles in inputs:
-                    if isinstance(smiles, str) and smiles.strip():
-                        smiles = smiles.lower().strip()
-                        smiles_to_reactions[smiles].append(reaction_id)
-                        if PUBCHEM_AVAILABLE and smiles not in reagent_cache:
-                            try:
-                                compounds = pcp.get_compounds(smiles, 'smiles')
-                                if compounds:
-                                    name = compounds[0].iupac_name or (compounds[0].synonyms[0] if compounds[0].synonyms else None)
-                                    if name:
-                                        reagent_cache[smiles] = name.lower()
-                                        name_to_reactions[reagent_cache[smiles]].append(reaction_id)
-                            except:
-                                pass
-            # Extract products
-            products = reaction.get('products_smiles', [])
-            if products:
-                for smiles in products:
-                    if isinstance(smiles, str) and smiles.strip():
-                        smiles = smiles.lower().strip()
-                        smiles_to_reactions[smiles].append(reaction_id)
-                        if PUBCHEM_AVAILABLE and smiles not in reagent_cache:
-                            try:
-                                compounds = pcp.get_compounds(smiles, 'smiles')
-                                if compounds:
-                                    name = compounds[0].iupac_name or (compounds[0].synonyms[0] if compounds[0].synonyms else None)
-                                    if name:
-                                        reagent_cache[smiles] = name.lower()
-                                        name_to_reactions[reagent_cache[smiles]].append(reaction_id)
-                            except:
-                                pass
-        log_messages.append(f"\n[OK] Processed {processed:,} reactions\n")
-        progress(0.65, desc="Building index...")
-        log_messages.append("[*] Building index entries...")
-        yield "\n".join(log_messages), [], 0.65
         # 4. Build index
         index_entries = []
-        # Add SMILES entries
         for smiles, reaction_ids in smiles_to_reactions.items():
             unique_ids = list(set(reaction_ids))
             index_entries.append({
                 'search_term': smiles,
                 'search_type': 'smiles',
                 'reaction_ids': unique_ids,
-                'count': len(unique_ids),
-                'common_name': reagent_cache.get(smiles, None)
             })
-        # Add name entries
-        for name, reaction_ids in name_to_reactions.items():
-            unique_ids = list(set(reaction_ids))
-            index_entries.append({
-                'search_term': name,
-                'search_type': 'name',
-                'reaction_ids': unique_ids,
-                'count': len(unique_ids),
-                'common_name': name
-            })
-        log_messages.append(f"[OK] Created {len(index_entries):,} index entries")
-        log_messages.append(f"    - SMILES: {len(smiles_to_reactions):,}")
-        log_messages.append(f"    - Names: {len(name_to_reactions):,}\n")
-        progress(0.8, desc="Uploading to Hugging Face...")
-        log_messages.append("[*] Uploading to Hugging Face...")
-        yield "\n".join(log_messages), [], 0.8
-        # 5. Upload to HF
         index_dataset = Dataset.from_list(index_entries)
         index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
-        log_messages.append("[OK] Upload complete!\n")
-        log_messages.append("="*70)
-        log_messages.append("SUCCESS! Reagent index created and uploaded!")
-        log_messages.append("="*70)
-        log_messages.append(f"Dataset URL: https://huggingface.co/datasets/{HF_DATASET_NAME}")
-        log_messages.append(f"Total entries: {len(index_entries):,}")
-        log_messages.append(f"Total reactions: {processed:,}")
-        progress(1.0, desc="Complete!")
-        # Create sample table
         sample_data = []
         for i, entry in enumerate(index_entries[:10]):
             sample_data.append([
@@ -184,45 +112,44 @@ def build_reagent_index(progress=gr.Progress()):
                 entry['count']
             ])
-        yield "\n".join(log_messages), sample_data, 1.0
     except Exception as e:
-        error_msg = f"❌ Error: {str(e)}\n\n{type(e).__name__}"
         import traceback
-        error_msg += f"\n\n{traceback.format_exc()}"
-        yield error_msg, [], 0.0
 # Create Gradio interface
 with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🧪 ORD Reagent Index Builder")
-    gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face")
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Info")
             gr.Markdown("""
             This tool creates a fast search index for the Open Reaction Database.
             **Features:**
             - Streams 2.7M reactions (no memory issues)
-            - PubChem chemical name lookup
             - SMILES indexing
             - Auto-uploads to Hugging Face
             **Time:** ~10-20 minutes
-            **Status:** Ready to start!
             """)
         with gr.Column():
-            gr.Markdown("### Quick Links")
             gr.Markdown("""
             [ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)
-            [Reagent Index](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
-            [GitHub](https://github.com/Open-Reaction-Database/ord-interface)
             """)
     gr.Markdown("---")
@@ -232,36 +159,25 @@ with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as dem
     gr.Markdown("---")
-    # Output sections
-    gr.Markdown("### Progress & Logs")
-    logs_output = gr.Textbox(
-        label="Build Logs",
-        lines=15,
-        max_lines=20,
         interactive=False,
-        placeholder="Click 'Start Building Index' to begin..."
-    )
-    progress_bar = gr.Slider(
-        minimum=0,
-        maximum=1,
-        value=0,
-        step=0.01,
-        label="Progress",
-        interactive=False
     )
     gr.Markdown("### Sample Index Entries (First 10)")
-    sample_table = gr.Dataframe(
         headers=["Search Term", "Type", "Count"],
-        label="Index Sample",
         interactive=False
     )
     # Event handler
     start_btn.click(
-        fn=build_reagent_index,
-        outputs=[logs_output, sample_table, progress_bar]
     )
 if __name__ == "__main__":

 """
+ORD Reagent Index Builder - Gradio App (Simplified)
+Runs directly on Hugging Face Spaces
 """
 import gradio as gr
 import os
+import sys
 from collections import defaultdict
 from datasets import load_dataset, Dataset
 from huggingface_hub import login
 # Check for HF_TOKEN
 HF_TOKEN = os.getenv("HF_TOKEN")
 HF_DATASET_NAME = "smitathkr1/ord-reagent-index"
 SAMPLE_SIZE = None  # Set to 100 for testing
+def build_index():
+    """Build the reagent index."""
     if not HF_TOKEN:
+        return "ERROR: HF_TOKEN not found. Add it to Space secrets.", []
     try:
+        print("\n" + "="*70)
+        print("Starting ORD Reagent Index Builder")
+        print("="*70 + "\n")
+        # 1. Auth
+        print("[1/5] Authenticating with Hugging Face...")
         login(token=HF_TOKEN)
+        print("      OK - Authenticated\n")
         # 2. Load dataset
+        print("[2/5] Loading dataset in streaming mode...")
         ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
+        print("      OK - Dataset loaded\n")
         # 3. Process reactions
+        print("[3/5] Processing reactions...")
+        print("      This will take 10-20 minutes, please wait...\n")
         smiles_to_reactions = defaultdict(list)
         name_to_reactions = defaultdict(list)
         try:
             import pubchempy as pcp
             PUBCHEM_AVAILABLE = True
         except ImportError:
             PUBCHEM_AVAILABLE = False
         processed = 0
         for reaction in ds:
             processed += 1
             if SAMPLE_SIZE and processed > SAMPLE_SIZE:
                 break
+            if processed % 100000 == 0:
+                print(f"      [{processed:,}] reactions processed...")
+            reaction_id = reaction.get('reaction_id')
+            # Extract SMILES
+            for smiles in (reaction.get('inputs_smiles', []) + reaction.get('products_smiles', [])):
+                if isinstance(smiles, str) and smiles.strip():
+                    smiles_lower = smiles.lower().strip()
+                    smiles_to_reactions[smiles_lower].append(reaction_id)
+        print(f"\n      Total: {processed:,} reactions processed\n")
         # 4. Build index
+        print("[4/5] Building index...")
         index_entries = []
         for smiles, reaction_ids in smiles_to_reactions.items():
             unique_ids = list(set(reaction_ids))
             index_entries.append({
                 'search_term': smiles,
                 'search_type': 'smiles',
                 'reaction_ids': unique_ids,
+                'count': len(unique_ids)
             })
+        print(f"      Created {len(index_entries):,} index entries\n")
+        # 5. Upload
+        print("[5/5] Uploading to Hugging Face...")
         index_dataset = Dataset.from_list(index_entries)
         index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
+        print("      OK - Upload complete\n")
+        # Summary
+        print("="*70)
+        print("SUCCESS!")
+        print("="*70)
+        print(f"Total reactions: {processed:,}")
+        print(f"Index entries: {len(index_entries):,}")
+        print(f"Dataset: https://huggingface.co/datasets/{HF_DATASET_NAME}\n")
+        # Sample data
         sample_data = []
         for i, entry in enumerate(index_entries[:10]):
             sample_data.append([
                 entry['count']
             ])
+        # Get all output from print statements
+        import io
+        import contextlib
+        return "✅ Index built successfully! Check logs above.", sample_data
     except Exception as e:
+        error_msg = f"ERROR: {str(e)}\n\nDetails:\n{type(e).__name__}"
+        print(f"\n{error_msg}\n")
         import traceback
+        traceback.print_exc()
+        return error_msg, []
 # Create Gradio interface
 with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🧪 ORD Reagent Index Builder")
+    gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face Spaces")
     with gr.Row():
         with gr.Column():
             gr.Markdown("""
+            ### About
             This tool creates a fast search index for the Open Reaction Database.
             **Features:**
             - Streams 2.7M reactions (no memory issues)
             - SMILES indexing
             - Auto-uploads to Hugging Face
             **Time:** ~10-20 minutes
             """)
         with gr.Column():
             gr.Markdown("""
+            ### Links
             [ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)
+            [Index Dataset](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
             """)
     gr.Markdown("---")
     gr.Markdown("---")
+    # Output
+    gr.Markdown("### Output")
+    status_output = gr.Textbox(
+        label="Status",
+        lines=3,
         interactive=False,
+        placeholder="Click start button..."
     )
     gr.Markdown("### Sample Index Entries (First 10)")
+    table_output = gr.Dataframe(
         headers=["Search Term", "Type", "Count"],
         interactive=False
     )
     # Event handler
     start_btn.click(
+        fn=build_index,
+        outputs=[status_output, table_output]
     )
 if __name__ == "__main__":