# testtest123's picture
# Simplify: Remove yield/generator - use print statements instead
# 2d50652
"""
ORD Reagent Index Builder - Gradio App (Simplified)
Runs directly on Hugging Face Spaces
"""
import gradio as gr
import os
import sys
from collections import defaultdict
from datasets import load_dataset, Dataset
from huggingface_hub import login
# Check for HF_TOKEN (must be set as a Space secret; needed for auth + upload)
HF_TOKEN = os.getenv("HF_TOKEN")

# Configuration
ORIGINAL_DATASET = "smitathkr1/ord-reactions"      # source dataset (streamed)
HF_DATASET_NAME = "smitathkr1/ord-reagent-index"   # destination for the built index
SAMPLE_SIZE = None  # Set to 100 for testing; None processes the full dataset
def build_index():
    """Build the reagent index and upload it to the Hugging Face Hub.

    Streams the source reactions dataset, groups reaction IDs by
    lower-cased SMILES string, builds a flat index table, and pushes it
    to ``HF_DATASET_NAME``. Progress is reported via ``print`` so it
    shows up in the Space logs.

    Returns:
        tuple[str, list]: A status message for the UI, and up to 10
        sample index rows as ``[search_term, search_type, count]``.
    """
    if not HF_TOKEN:
        return "ERROR: HF_TOKEN not found. Add it to Space secrets.", []

    try:
        print("\n" + "=" * 70)
        print("Starting ORD Reagent Index Builder")
        print("=" * 70 + "\n")

        # 1. Auth
        print("[1/5] Authenticating with Hugging Face...")
        login(token=HF_TOKEN)
        print(" OK - Authenticated\n")

        # 2. Load dataset (streaming keeps memory flat for ~2.7M rows)
        print("[2/5] Loading dataset in streaming mode...")
        ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
        print(" OK - Dataset loaded\n")

        # 3. Process reactions: map each SMILES string to the reactions
        # it appears in.
        print("[3/5] Processing reactions...")
        print(" This will take 10-20 minutes, please wait...\n")

        smiles_to_reactions = defaultdict(list)

        processed = 0
        for reaction in ds:
            processed += 1
            if SAMPLE_SIZE and processed > SAMPLE_SIZE:
                break
            if processed % 100000 == 0:
                print(f" [{processed:,}] reactions processed...")

            reaction_id = reaction.get('reaction_id')

            # Extract SMILES from inputs and products. Use `or []` because
            # a field that exists with a null value makes .get(key, [])
            # return None, and `None + list` would raise TypeError.
            inputs = reaction.get('inputs_smiles') or []
            products = reaction.get('products_smiles') or []
            for smiles in inputs + products:
                if isinstance(smiles, str) and smiles.strip():
                    smiles_to_reactions[smiles.lower().strip()].append(reaction_id)

        print(f"\n Total: {processed:,} reactions processed\n")

        # 4. Build index rows, de-duplicating reaction IDs per SMILES.
        print("[4/5] Building index...")
        index_entries = []
        for smiles, reaction_ids in smiles_to_reactions.items():
            unique_ids = list(set(reaction_ids))
            index_entries.append({
                'search_term': smiles,
                'search_type': 'smiles',
                'reaction_ids': unique_ids,
                'count': len(unique_ids)
            })
        print(f" Created {len(index_entries):,} index entries\n")

        # 5. Upload
        print("[5/5] Uploading to Hugging Face...")
        index_dataset = Dataset.from_list(index_entries)
        index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
        print(" OK - Upload complete\n")

        # Summary
        print("=" * 70)
        print("SUCCESS!")
        print("=" * 70)
        print(f"Total reactions: {processed:,}")
        print(f"Index entries: {len(index_entries):,}")
        print(f"Dataset: https://huggingface.co/datasets/{HF_DATASET_NAME}\n")

        # First 10 rows for the Gradio dataframe preview (term truncated
        # to 30 chars for display).
        sample_data = [
            [entry['search_term'][:30], entry['search_type'], entry['count']]
            for entry in index_entries[:10]
        ]

        return "✅ Index built successfully! Check logs above.", sample_data

    except Exception as e:
        # Surface the failure both in the Space logs and the UI status box.
        error_msg = f"ERROR: {str(e)}\n\nDetails:\n{type(e).__name__}"
        print(f"\n{error_msg}\n")
        import traceback
        traceback.print_exc()
        return error_msg, []
# ---------------------------------------------------------------------------
# Gradio interface: a single "start" button that runs build_index() and
# fills the status textbox plus a preview table of the first 10 entries.
# ---------------------------------------------------------------------------
with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧪 ORD Reagent Index Builder")
    gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face Spaces")

    with gr.Row():
        with gr.Column():
            gr.Markdown("""
### About
This tool creates a fast search index for the Open Reaction Database.
**Features:**
- Streams 2.7M reactions (no memory issues)
- SMILES indexing
- Auto-uploads to Hugging Face
**Time:** ~10-20 minutes
""")
        with gr.Column():
            gr.Markdown("""
### Links
[ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)
[Index Dataset](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
""")

    gr.Markdown("---")

    # Main button that kicks off the (10-20 minute) build
    start_btn = gr.Button("🚀 Start Building Index", size="lg", variant="primary")

    gr.Markdown("---")

    # Output widgets: status message + sample-entries table
    gr.Markdown("### Output")
    status_output = gr.Textbox(
        label="Status",
        lines=3,
        interactive=False,
        placeholder="Click start button..."
    )

    gr.Markdown("### Sample Index Entries (First 10)")
    table_output = gr.Dataframe(
        headers=["Search Term", "Type", "Count"],
        interactive=False
    )

    # Event handler: build_index takes no inputs and returns
    # (status string, sample rows) matching the two outputs below.
    start_btn.click(
        fn=build_index,
        outputs=[status_output, table_output]
    )

if __name__ == "__main__":
    # share=False: serve only on the Space's own URL, no public tunnel
    demo.launch(share=False)