testtest123 committed on
Commit
2d50652
·
1 Parent(s): e217ee7

Simplify: Remove yield/generator - use print statements instead

Browse files
Files changed (1) hide show
  1. app.py +61 -145
app.py CHANGED
@@ -1,15 +1,14 @@
1
  """
2
- ORD Reagent Index Builder - Gradio App
3
- Runs directly on Hugging Face Spaces (Python SDK, no Docker needed)
4
  """
5
 
6
  import gradio as gr
7
  import os
 
8
  from collections import defaultdict
9
- from pathlib import Path
10
  from datasets import load_dataset, Dataset
11
  from huggingface_hub import login
12
- import sys
13
 
14
  # Check for HF_TOKEN
15
  HF_TOKEN = os.getenv("HF_TOKEN")
@@ -19,39 +18,30 @@ ORIGINAL_DATASET = "smitathkr1/ord-reactions"
19
  HF_DATASET_NAME = "smitathkr1/ord-reagent-index"
20
  SAMPLE_SIZE = None # Set to 100 for testing
21
 
22
- def build_reagent_index(progress=gr.Progress()):
23
- """Main function to build the reagent index."""
24
 
25
  if not HF_TOKEN:
26
- yield "❌ Error: HF_TOKEN not found! Please add it to Space secrets.", [], 0
27
 
28
  try:
29
- log_messages = []
30
-
31
- progress(0, desc="Authenticating...")
32
- log_messages.append("[*] Authenticating with Hugging Face...")
33
- yield "\n".join(log_messages), [], 0.0
34
 
35
- # 1. Authentication
 
36
  login(token=HF_TOKEN)
37
- progress(0.05, desc="Authenticated successfully")
38
- log_messages.append("[OK] Authenticated successfully\n")
39
- yield "\n".join(log_messages), [], 0.05
40
 
41
  # 2. Load dataset
42
- progress(0.1, desc="Loading dataset...")
43
- log_messages.append("[*] Loading dataset in streaming mode...")
44
- yield "\n".join(log_messages), [], 0.1
45
-
46
  ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
47
- log_messages.append("[OK] Dataset loaded\n")
48
- yield "\n".join(log_messages), [], 0.1
49
 
50
  # 3. Process reactions
51
- progress(0.15, desc="Processing reactions...")
52
- log_messages.append("[*] Processing 2.7M reactions...")
53
- log_messages.append("This will take 10-20 minutes, please be patient...\n")
54
- yield "\n".join(log_messages), [], 0.15
55
 
56
  smiles_to_reactions = defaultdict(list)
57
  name_to_reactions = defaultdict(list)
@@ -60,15 +50,10 @@ def build_reagent_index(progress=gr.Progress()):
60
  try:
61
  import pubchempy as pcp
62
  PUBCHEM_AVAILABLE = True
63
- log_messages.append("[OK] PubChem available for chemical name lookup\n")
64
  except ImportError:
65
  PUBCHEM_AVAILABLE = False
66
- log_messages.append("[⚠] PubChem not available - using SMILES only\n")
67
-
68
- yield "\n".join(log_messages), [], 0.15
69
 
70
  processed = 0
71
- last_logged = 0
72
 
73
  for reaction in ds:
74
  processed += 1
@@ -76,106 +61,49 @@ def build_reagent_index(progress=gr.Progress()):
76
  if SAMPLE_SIZE and processed > SAMPLE_SIZE:
77
  break
78
 
79
- # Update progress every 10,000 reactions (less frequent for better performance)
80
- if processed - last_logged >= 10000:
81
- pct = min(0.6, (processed / 2700000) * 0.5 + 0.15)
82
- progress(pct, desc=f"Processing: {processed:,} reactions...")
83
- log_messages.append(f"[{processed:,}] Processed {processed:,} reactions...")
84
- yield "\n".join(log_messages), [], pct
85
- last_logged = processed
86
 
87
- reaction_id = reaction.get('reaction_id', 'unknown')
88
 
89
- # Extract inputs (reactants)
90
- inputs = reaction.get('inputs_smiles', [])
91
- if inputs:
92
- for smiles in inputs:
93
- if isinstance(smiles, str) and smiles.strip():
94
- smiles = smiles.lower().strip()
95
- smiles_to_reactions[smiles].append(reaction_id)
96
-
97
- if PUBCHEM_AVAILABLE and smiles not in reagent_cache:
98
- try:
99
- compounds = pcp.get_compounds(smiles, 'smiles')
100
- if compounds:
101
- name = compounds[0].iupac_name or (compounds[0].synonyms[0] if compounds[0].synonyms else None)
102
- if name:
103
- reagent_cache[smiles] = name.lower()
104
- name_to_reactions[reagent_cache[smiles]].append(reaction_id)
105
- except:
106
- pass
107
-
108
- # Extract products
109
- products = reaction.get('products_smiles', [])
110
- if products:
111
- for smiles in products:
112
- if isinstance(smiles, str) and smiles.strip():
113
- smiles = smiles.lower().strip()
114
- smiles_to_reactions[smiles].append(reaction_id)
115
-
116
- if PUBCHEM_AVAILABLE and smiles not in reagent_cache:
117
- try:
118
- compounds = pcp.get_compounds(smiles, 'smiles')
119
- if compounds:
120
- name = compounds[0].iupac_name or (compounds[0].synonyms[0] if compounds[0].synonyms else None)
121
- if name:
122
- reagent_cache[smiles] = name.lower()
123
- name_to_reactions[reagent_cache[smiles]].append(reaction_id)
124
- except:
125
- pass
126
 
127
- log_messages.append(f"\n[OK] Processed {processed:,} reactions\n")
128
- progress(0.65, desc="Building index...")
129
- log_messages.append("[*] Building index entries...")
130
- yield "\n".join(log_messages), [], 0.65
131
 
132
  # 4. Build index
 
133
  index_entries = []
134
 
135
- # Add SMILES entries
136
  for smiles, reaction_ids in smiles_to_reactions.items():
137
  unique_ids = list(set(reaction_ids))
138
  index_entries.append({
139
  'search_term': smiles,
140
  'search_type': 'smiles',
141
  'reaction_ids': unique_ids,
142
- 'count': len(unique_ids),
143
- 'common_name': reagent_cache.get(smiles, None)
144
  })
145
 
146
- # Add name entries
147
- for name, reaction_ids in name_to_reactions.items():
148
- unique_ids = list(set(reaction_ids))
149
- index_entries.append({
150
- 'search_term': name,
151
- 'search_type': 'name',
152
- 'reaction_ids': unique_ids,
153
- 'count': len(unique_ids),
154
- 'common_name': name
155
- })
156
 
157
- log_messages.append(f"[OK] Created {len(index_entries):,} index entries")
158
- log_messages.append(f" - SMILES: {len(smiles_to_reactions):,}")
159
- log_messages.append(f" - Names: {len(name_to_reactions):,}\n")
160
- progress(0.8, desc="Uploading to Hugging Face...")
161
- log_messages.append("[*] Uploading to Hugging Face...")
162
- yield "\n".join(log_messages), [], 0.8
163
-
164
- # 5. Upload to HF
165
  index_dataset = Dataset.from_list(index_entries)
166
  index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
 
167
 
168
- log_messages.append("[OK] Upload complete!\n")
169
- log_messages.append("="*70)
170
- log_messages.append("SUCCESS! Reagent index created and uploaded!")
171
- log_messages.append("="*70)
172
- log_messages.append(f"Dataset URL: https://huggingface.co/datasets/{HF_DATASET_NAME}")
173
- log_messages.append(f"Total entries: {len(index_entries):,}")
174
- log_messages.append(f"Total reactions: {processed:,}")
175
-
176
- progress(1.0, desc="Complete!")
177
 
178
- # Create sample table
179
  sample_data = []
180
  for i, entry in enumerate(index_entries[:10]):
181
  sample_data.append([
@@ -184,45 +112,44 @@ def build_reagent_index(progress=gr.Progress()):
184
  entry['count']
185
  ])
186
 
187
- yield "\n".join(log_messages), sample_data, 1.0
 
 
 
 
188
 
189
  except Exception as e:
190
- error_msg = f"❌ Error: {str(e)}\n\n{type(e).__name__}"
 
191
  import traceback
192
- error_msg += f"\n\n{traceback.format_exc()}"
193
- yield error_msg, [], 0.0
194
-
195
 
196
  # Create Gradio interface
197
  with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
198
  gr.Markdown("# 🧪 ORD Reagent Index Builder")
199
- gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face")
200
 
201
  with gr.Row():
202
  with gr.Column():
203
- gr.Markdown("### Info")
204
  gr.Markdown("""
 
205
  This tool creates a fast search index for the Open Reaction Database.
206
 
207
  **Features:**
208
  - Streams 2.7M reactions (no memory issues)
209
- - PubChem chemical name lookup
210
  - SMILES indexing
211
  - Auto-uploads to Hugging Face
212
 
213
  **Time:** ~10-20 minutes
214
-
215
- **Status:** Ready to start!
216
  """)
217
 
218
  with gr.Column():
219
- gr.Markdown("### Quick Links")
220
  gr.Markdown("""
 
221
  [ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)
222
 
223
- [Reagent Index](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
224
-
225
- [GitHub](https://github.com/Open-Reaction-Database/ord-interface)
226
  """)
227
 
228
  gr.Markdown("---")
@@ -232,36 +159,25 @@ with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as dem
232
 
233
  gr.Markdown("---")
234
 
235
- # Output sections
236
- gr.Markdown("### Progress & Logs")
237
- logs_output = gr.Textbox(
238
- label="Build Logs",
239
- lines=15,
240
- max_lines=20,
241
  interactive=False,
242
- placeholder="Click 'Start Building Index' to begin..."
243
- )
244
-
245
- progress_bar = gr.Slider(
246
- minimum=0,
247
- maximum=1,
248
- value=0,
249
- step=0.01,
250
- label="Progress",
251
- interactive=False
252
  )
253
 
254
  gr.Markdown("### Sample Index Entries (First 10)")
255
- sample_table = gr.Dataframe(
256
  headers=["Search Term", "Type", "Count"],
257
- label="Index Sample",
258
  interactive=False
259
  )
260
 
261
  # Event handler
262
  start_btn.click(
263
- fn=build_reagent_index,
264
- outputs=[logs_output, sample_table, progress_bar]
265
  )
266
 
267
  if __name__ == "__main__":
 
1
  """
2
+ ORD Reagent Index Builder - Gradio App (Simplified)
3
+ Runs directly on Hugging Face Spaces
4
  """
5
 
6
  import gradio as gr
7
  import os
8
+ import sys
9
  from collections import defaultdict
 
10
  from datasets import load_dataset, Dataset
11
  from huggingface_hub import login
 
12
 
13
  # Check for HF_TOKEN
14
  HF_TOKEN = os.getenv("HF_TOKEN")
 
18
  HF_DATASET_NAME = "smitathkr1/ord-reagent-index"
19
  SAMPLE_SIZE = None # Set to 100 for testing
20
 
21
+ def build_index():
22
+ """Build the reagent index."""
23
 
24
  if not HF_TOKEN:
25
+ return "ERROR: HF_TOKEN not found. Add it to Space secrets.", []
26
 
27
  try:
28
+ print("\n" + "="*70)
29
+ print("Starting ORD Reagent Index Builder")
30
+ print("="*70 + "\n")
 
 
31
 
32
+ # 1. Auth
33
+ print("[1/5] Authenticating with Hugging Face...")
34
  login(token=HF_TOKEN)
35
+ print(" OK - Authenticated\n")
 
 
36
 
37
  # 2. Load dataset
38
+ print("[2/5] Loading dataset in streaming mode...")
 
 
 
39
  ds = load_dataset(ORIGINAL_DATASET, split='train', streaming=True)
40
+ print(" OK - Dataset loaded\n")
 
41
 
42
  # 3. Process reactions
43
+ print("[3/5] Processing reactions...")
44
+ print(" This will take 10-20 minutes, please wait...\n")
 
 
45
 
46
  smiles_to_reactions = defaultdict(list)
47
  name_to_reactions = defaultdict(list)
 
50
  try:
51
  import pubchempy as pcp
52
  PUBCHEM_AVAILABLE = True
 
53
  except ImportError:
54
  PUBCHEM_AVAILABLE = False
 
 
 
55
 
56
  processed = 0
 
57
 
58
  for reaction in ds:
59
  processed += 1
 
61
  if SAMPLE_SIZE and processed > SAMPLE_SIZE:
62
  break
63
 
64
+ if processed % 100000 == 0:
65
+ print(f" [{processed:,}] reactions processed...")
 
 
 
 
 
66
 
67
+ reaction_id = reaction.get('reaction_id')
68
 
69
+ # Extract SMILES
70
+ for smiles in (reaction.get('inputs_smiles', []) + reaction.get('products_smiles', [])):
71
+ if isinstance(smiles, str) and smiles.strip():
72
+ smiles_lower = smiles.lower().strip()
73
+ smiles_to_reactions[smiles_lower].append(reaction_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
+ print(f"\n Total: {processed:,} reactions processed\n")
 
 
 
76
 
77
  # 4. Build index
78
+ print("[4/5] Building index...")
79
  index_entries = []
80
 
 
81
  for smiles, reaction_ids in smiles_to_reactions.items():
82
  unique_ids = list(set(reaction_ids))
83
  index_entries.append({
84
  'search_term': smiles,
85
  'search_type': 'smiles',
86
  'reaction_ids': unique_ids,
87
+ 'count': len(unique_ids)
 
88
  })
89
 
90
+ print(f" Created {len(index_entries):,} index entries\n")
 
 
 
 
 
 
 
 
 
91
 
92
+ # 5. Upload
93
+ print("[5/5] Uploading to Hugging Face...")
 
 
 
 
 
 
94
  index_dataset = Dataset.from_list(index_entries)
95
  index_dataset.push_to_hub(HF_DATASET_NAME, private=False, token=HF_TOKEN)
96
+ print(" OK - Upload complete\n")
97
 
98
+ # Summary
99
+ print("="*70)
100
+ print("SUCCESS!")
101
+ print("="*70)
102
+ print(f"Total reactions: {processed:,}")
103
+ print(f"Index entries: {len(index_entries):,}")
104
+ print(f"Dataset: https://huggingface.co/datasets/{HF_DATASET_NAME}\n")
 
 
105
 
106
+ # Sample data
107
  sample_data = []
108
  for i, entry in enumerate(index_entries[:10]):
109
  sample_data.append([
 
112
  entry['count']
113
  ])
114
 
115
+ # Get all output from print statements
116
+ import io
117
+ import contextlib
118
+
119
+ return "✅ Index built successfully! Check logs above.", sample_data
120
 
121
  except Exception as e:
122
+ error_msg = f"ERROR: {str(e)}\n\nDetails:\n{type(e).__name__}"
123
+ print(f"\n{error_msg}\n")
124
  import traceback
125
+ traceback.print_exc()
126
+ return error_msg, []
 
127
 
128
  # Create Gradio interface
129
  with gr.Blocks(title="ORD Reagent Index Builder", theme=gr.themes.Soft()) as demo:
130
  gr.Markdown("# 🧪 ORD Reagent Index Builder")
131
+ gr.Markdown("Create fast search index for 2.7M reactions on Hugging Face Spaces")
132
 
133
  with gr.Row():
134
  with gr.Column():
 
135
  gr.Markdown("""
136
+ ### About
137
  This tool creates a fast search index for the Open Reaction Database.
138
 
139
  **Features:**
140
  - Streams 2.7M reactions (no memory issues)
 
141
  - SMILES indexing
142
  - Auto-uploads to Hugging Face
143
 
144
  **Time:** ~10-20 minutes
 
 
145
  """)
146
 
147
  with gr.Column():
 
148
  gr.Markdown("""
149
+ ### Links
150
  [ORD Dataset](https://huggingface.co/datasets/smitathkr1/ord-reactions)
151
 
152
+ [Index Dataset](https://huggingface.co/datasets/smitathkr1/ord-reagent-index)
 
 
153
  """)
154
 
155
  gr.Markdown("---")
 
159
 
160
  gr.Markdown("---")
161
 
162
+ # Output
163
+ gr.Markdown("### Output")
164
+ status_output = gr.Textbox(
165
+ label="Status",
166
+ lines=3,
 
167
  interactive=False,
168
+ placeholder="Click start button..."
 
 
 
 
 
 
 
 
 
169
  )
170
 
171
  gr.Markdown("### Sample Index Entries (First 10)")
172
+ table_output = gr.Dataframe(
173
  headers=["Search Term", "Type", "Count"],
 
174
  interactive=False
175
  )
176
 
177
  # Event handler
178
  start_btn.click(
179
+ fn=build_index,
180
+ outputs=[status_output, table_output]
181
  )
182
 
183
  if __name__ == "__main__":