|
|
""" |
|
|
ORD Reagent Index Builder - Gradio App (Simplified) |
|
|
Runs directly on Hugging Face Spaces |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import os |
|
|
import sys |
|
|
from collections import defaultdict |
|
|
from datasets import load_dataset, Dataset |
|
|
from huggingface_hub import login |
|
|
|
|
|
|
|
|
# Hugging Face access token; on Spaces this is injected via the Space's
# secret settings.  build_index() refuses to run without it.
HF_TOKEN = os.getenv("HF_TOKEN")

# Source dataset of ORD reactions that is streamed during indexing.
ORIGINAL_DATASET = "smitathkr1/ord-reactions"
# Destination dataset name for the generated reagent search index.
HF_DATASET_NAME = "smitathkr1/ord-reagent-index"
# Optional cap on the number of reactions to process (useful for quick
# test runs); None means process the entire dataset.
SAMPLE_SIZE = None
|
|
|
|
|
def build_index():
    """Build a SMILES -> reaction-id search index and push it to the Hub.

    Streams the source dataset (so the full 2.7M reactions never sit in
    memory at once), groups reaction ids by normalized input/product
    SMILES, uploads the resulting index dataset, and returns the status
    for the Gradio UI.

    Returns:
        tuple[str, list]: A human-readable status message and up to 10
        sample rows ([search_term, search_type, count]) for the preview
        table.  On error the row list is empty.
    """
    if not HF_TOKEN:
        return "ERROR: HF_TOKEN not found. Add it to Space secrets.", []

    try:
        print("\n" + "=" * 70)
        print("Starting ORD Reagent Index Builder")
        print("=" * 70 + "\n")

        print("[1/5] Authenticating with Hugging Face...")
        login(token=HF_TOKEN)
        print(" OK - Authenticated\n")

        print("[2/5] Loading dataset in streaming mode...")
        # streaming=True yields an iterable dataset; rows are fetched
        # lazily instead of downloading everything up front.
        ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
        print(" OK - Dataset loaded\n")

        print("[3/5] Processing reactions...")
        print(" This will take 10-20 minutes, please wait...\n")

        # Maps normalized SMILES -> list of reaction ids it appears in.
        smiles_to_reactions = defaultdict(list)
        processed = 0

        for reaction in ds:
            processed += 1

            # SAMPLE_SIZE (if set) caps the run for quick testing.
            if SAMPLE_SIZE and processed > SAMPLE_SIZE:
                break

            if processed % 100000 == 0:
                print(f" [{processed:,}] reactions processed...")

            reaction_id = reaction.get('reaction_id')

            # NOTE(review): SMILES is case-sensitive (aromatic 'c' vs
            # aliphatic 'C'), so lower-casing can conflate distinct
            # molecules.  Kept as-is because the downstream search
            # presumably lower-cases queries the same way -- confirm
            # before changing.
            for smiles in (reaction.get('inputs_smiles', []) + reaction.get('products_smiles', [])):
                if isinstance(smiles, str) and smiles.strip():
                    smiles_to_reactions[smiles.lower().strip()].append(reaction_id)

        print(f"\n Total: {processed:,} reactions processed\n")

        print("[4/5] Building index...")
        index_entries = []
        for smiles, reaction_ids in smiles_to_reactions.items():
            # De-duplicate: the same reaction can list a SMILES in both
            # inputs and products.
            unique_ids = list(set(reaction_ids))
            index_entries.append({
                'search_term': smiles,
                'search_type': 'smiles',
                'reaction_ids': unique_ids,
                'count': len(unique_ids),
            })

        print(f" Created {len(index_entries):,} index entries\n")

        print("[5/5] Uploading to Hugging Face...")
        index_dataset = Dataset.from_list(index_entries)
        index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
        print(" OK - Upload complete\n")

        print("=" * 70)
        print("SUCCESS!")
        print("=" * 70)
        print(f"Total reactions: {processed:,}")
        print(f"Index entries: {len(index_entries):,}")
        print(f"Dataset: https://huggingface.co/datasets/{HF_DATASET_NAME}\n")

        # First 10 entries, with the search term truncated for display.
        sample_data = [
            [entry['search_term'][:30], entry['search_type'], entry['count']]
            for entry in index_entries[:10]
        ]

        return "✅ Index built successfully! Check logs above.", sample_data

    except Exception as e:
        # Top-level UI boundary: report the failure to the Textbox and
        # log the full traceback instead of crashing the Space.
        error_msg = f"ERROR: {str(e)}\n\nDetails:\n{type(e).__name__}"
        print(f"\n{error_msg}\n")
        import traceback
        traceback.print_exc()
        return error_msg, []
|
|
|
|
|
|
|
|
# Gradio UI: an intro section, one action button, and two output widgets
# (a status textbox and a sample-rows table) fed by build_index().
with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧪 ORD Reagent Index Builder")
    gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face Spaces")

    # Two-column intro: description on the left, related links on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ### About
            This tool creates a fast search index for the Open Reaction Database.

            **Features:**
            - Streams 2.7M reactions (no memory issues)
            - SMILES indexing
            - Auto-uploads to Hugging Face

            **Time:** ~10-20 minutes
            """)

        with gr.Column():
            gr.Markdown("""
            ### Links
            [ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)

            [Index Dataset](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
            """)

    gr.Markdown("---")

    # Single action button that starts the (long-running) build.
    run_button = gr.Button("🚀 Start Building Index", size="lg", variant="primary")

    gr.Markdown("---")

    gr.Markdown("### Output")
    status_box = gr.Textbox(
        label="Status",
        placeholder="Click start button...",
        lines=3,
        interactive=False,
    )

    gr.Markdown("### Sample Index Entries (First 10)")
    sample_table = gr.Dataframe(
        headers=["Search Term", "Type", "Count"],
        interactive=False,
    )

    # Wire the button to the build job; it fills both outputs on return.
    run_button.click(fn=build_index, outputs=[status_box, sample_table])
|
|
|
|
|
# Script entry point: launch the Gradio server (Spaces runs this file
# directly; share links are unnecessary there).
if __name__ == "__main__":
    demo.launch(share=False)
|
|
|