Qianhui19 commited on
Commit
caf4896
·
verified ·
1 Parent(s): 22e7ff6

Delete step2_CECs_annotating_agent_v1.0.py

Browse files
Files changed (1) hide show
  1. step2_CECs_annotating_agent_v1.0.py +0 -578
step2_CECs_annotating_agent_v1.0.py DELETED
@@ -1,578 +0,0 @@
1
- # ==================== Compound Batch Query Tool (Desktop Version) ====================
2
- # Supports batch query (AC50 matching function removed)
3
-
4
- import tkinter as tk
5
- from tkinter import ttk, filedialog, messagebox, scrolledtext
6
- import pandas as pd
7
- import requests
8
- import json
9
- import os
10
- import time
11
- from typing import Optional, Dict, List
12
- from datetime import datetime
13
- import threading
14
- import sys
15
-
16
-
17
- # ==================== Core Function Module ====================
18
class DifyBasicChat:
    """Minimal client for the Dify ``POST /chat-messages`` endpoint.

    Supports both "blocking" (single JSON response) and streaming (SSE)
    response modes. Network failures never raise: they are returned as an
    error tuple so batch callers can keep going.
    """

    def __init__(self, api_key: str, base_url: str = "http://localhost/v1"):
        self.api_key = api_key
        # Normalize so path joins below never produce a double slash.
        self.base_url = base_url.rstrip("/")
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def send_message(
        self,
        query: str,
        user: str,
        conversation_id: Optional[str] = None,
        response_mode: str = "blocking",
        inputs: dict = None
    ) -> tuple:
        """Send a chat message.

        Returns a ``(answer, conversation_id, full_response)`` tuple. On a
        network error the tuple is ``(error_message, None, {"error": ...})``.
        """
        url = f"{self.base_url}/chat-messages"
        payload = {
            "query": query,
            "user": user,
            "response_mode": response_mode,
            "inputs": inputs or {}
        }
        if conversation_id:
            payload["conversation_id"] = conversation_id

        full_response = None
        try:
            if response_mode == "blocking":
                res = requests.post(url, headers=self.headers, json=payload, timeout=120)
                res.raise_for_status()
                full_response = res.json()
                answer = full_response.get("answer", "")
                conv_id = full_response.get("conversation_id")
                return answer, conv_id, full_response

            else:
                # Streaming (SSE) mode: accumulate "message" events until
                # "message_end" arrives.
                full_answer = ""
                conv_id = None
                res = requests.post(url, headers=self.headers, json=payload, stream=True, timeout=120)
                res.raise_for_status()
                for line in res.iter_lines():
                    if not line:
                        continue
                    decoded = line.decode("utf-8")
                    # BUG FIX: the original used lstrip("data: "), which strips
                    # any run of the characters {d, a, t, ' ', ':'} from the
                    # start of the line rather than the literal SSE "data: "
                    # prefix. Remove the exact prefix instead.
                    if decoded.startswith("data: "):
                        decoded = decoded[len("data: "):]
                    if not decoded:
                        continue
                    try:
                        data = json.loads(decoded)
                    except json.JSONDecodeError:
                        # Ignore keep-alive / non-JSON lines.
                        continue
                    full_response = data
                    event = data.get("event")
                    if event == "message":
                        full_answer += data.get("answer", "")
                    elif event == "message_end":
                        conv_id = data.get("conversation_id")
                        break
                    elif event == "error":
                        raise Exception(f"Streaming Error: {data.get('message')}")
                return full_answer, conv_id, full_response

        except requests.exceptions.RequestException as e:
            error_msg = f"Request Failed: {str(e)}"
            return error_msg, None, {"error": error_msg}
84
-
85
-
86
def parse_dify_response(answer_text: str) -> dict:
    """Parse classification and complete information returned by Dify.

    Accepts either a nested payload ``{"CompoundName": {...fields...}}`` or a
    flat payload (old format) with the category fields at the top level,
    optionally wrapped in a Markdown code fence. Parsing errors are reported
    in-band via the "Main Category" field rather than raised, so one bad
    response does not abort a batch.
    """
    result = {
        #"CASRN": "",
        "Main Category": "",
        "Additional Category 1": "",
        "Additional Category 2": "",
        "EndpointName": [],  # Keep for compatibility, no longer used for matching
        "XLogP": "",
        "BioPathway": "",
        "ToxicityInfo": "",
        "KnownUse": "",
        "DisorderDisease": ""
    }

    try:
        clean_text = answer_text.strip()
        # BUG FIX: the original used str.replace to drop code-fence markers,
        # which also removed backticks occurring *inside* the JSON payload
        # (corrupting string values). Strip only the leading and trailing
        # fence instead.
        if clean_text.startswith("```json"):
            clean_text = clean_text[len("```json"):]
        elif clean_text.startswith("```"):
            clean_text = clean_text[len("```"):]
        if clean_text.endswith("```"):
            clean_text = clean_text[:-len("```")]
        clean_text = clean_text.strip()

        # Parse JSON
        response_json = json.loads(clean_text)

        if isinstance(response_json, dict):
            # Get compound name (first key)
            compound_name = next(iter(response_json.keys())) if response_json else ""

            if compound_name and isinstance(response_json.get(compound_name), dict):
                # Nested format: {"CompoundName": {...}}
                category_info = response_json[compound_name]

                # Extract all fields
                #result["CASRN"] = category_info.get("CASRN", "")
                result["Main Category"] = category_info.get("Main Category", "")
                result["Additional Category 1"] = category_info.get("Additional Category 1", "")
                result["Additional Category 2"] = category_info.get("Additional Category 2", "")

                # Process EndpointName - may be list or string
                endpoint_value = category_info.get("EndpointName", [])
                if isinstance(endpoint_value, list):
                    result["EndpointName"] = endpoint_value
                elif isinstance(endpoint_value, str):
                    result["EndpointName"] = [endpoint_value] if endpoint_value else []

                result["XLogP"] = category_info.get("XLogP", "")
                result["BioPathway"] = category_info.get("BioPathway", "")
                result["ToxicityInfo"] = category_info.get("ToxicityInfo", "")
                result["KnownUse"] = category_info.get("KnownUse", "")
                result["DisorderDisease"] = category_info.get("DisorderDisease", "")

            else:
                # Flat format (compatible with old format)
                result["Main Category"] = response_json.get("Main Category", "")
                result["Additional Category 1"] = response_json.get("Additional Category 1", "")
                result["Additional Category 2"] = response_json.get("Additional Category 2", "")

    except json.JSONDecodeError as e:
        result["Main Category"] = f"JSON Parsing Error: {str(e)}"
    except Exception as e:
        result["Main Category"] = f"Parsing Failed: {str(e)}"

    return result
151
-
152
-
153
def normalize_compound_name(name: str) -> str:
    """Return *name* with one pair of surrounding quotes removed and all
    internal whitespace runs collapsed to single spaces.

    Non-string input yields an empty string.
    """
    if not isinstance(name, str):
        return ""

    cleaned = name.strip()
    # Drop a single matching pair of surrounding quotes, if present.
    for quote in ('"', "'"):
        if cleaned.startswith(quote) and cleaned.endswith(quote):
            cleaned = cleaned[1:-1]
            break

    # Collapse runs of whitespace into single spaces.
    return ' '.join(cleaned.split())
169
-
170
-
171
def expand_endpoint_rows(parsed_result: dict, compound_name: str) -> list:
    """
    Expand EndpointName into multiple rows (without AC50 matching).

    Returns one row dict per entry in parsed_result["EndpointName"]; when
    there are no endpoints, a single row with an empty "EndpointName" is
    returned so the compound still appears in the output CSV.
    """
    # Normalize compound name once; every row shares it.
    compound_clean = normalize_compound_name(compound_name)

    def make_row(endpoint: str) -> dict:
        # DRY FIX: the original duplicated this 12-key literal in both the
        # "no endpoints" and "per endpoint" branches; only EndpointName
        # differs between rows.
        return {
            "CompoundName": compound_clean,
            "OriginalCompoundName": compound_name,
            #"CASRN": parsed_result.get("CASRN", ""),
            "MainCategory": parsed_result.get("Main Category", ""),
            "AdditionalCategory1": parsed_result.get("Additional Category 1", ""),
            "AdditionalCategory2": parsed_result.get("Additional Category 2", ""),
            "EndpointName": endpoint,
            "XLogP": parsed_result.get("XLogP", ""),
            "BioPathway": parsed_result.get("BioPathway", ""),
            "ToxicityInfo": parsed_result.get("ToxicityInfo", ""),
            "KnownUse": parsed_result.get("KnownUse", ""),
            "DisorderDisease": parsed_result.get("DisorderDisease", "")
        }

    endpoint_names = parsed_result.get("EndpointName", [])
    if not endpoint_names:
        # Create one placeholder row if no EndpointName was returned.
        return [make_row("")]
    # Create one row per endpoint (without AC50 matching).
    return [make_row(endpoint) for endpoint in endpoint_names]
219
-
220
-
221
def batch_process_compounds_gui(
    csv_path: str,
    save_root: str,
    api_key: str,
    base_url: str,
    log_text: tk.Text,
    progress_var: tk.DoubleVar,
    user_id: str = "batch_compound_user",
    compound_col: str = "IUPAC_name",
    batch_num: int = 1,
    csv_encoding: str = "utf-8",
    csv_sep: str = ","
):
    """Batch process compounds (adapted for GUI, AC50 matching removed).

    Reads compound names from column ``compound_col`` of ``csv_path``,
    queries the Dify API once per unique compound, expands each parsed
    result into endpoint rows, and writes:
      * one ``Original_Record_<n>.json`` per compound (debug record),
      * the merged results CSV, and
      * a separate failed-compound CSV when failures occurred,
    all into a timestamped folder under ``save_root``. Progress and log
    messages are pushed to the supplied Tk widgets; errors are reported via
    the log/messagebox rather than raised.

    NOTE(review): this function is invoked from a worker thread (see
    CompoundBatchToolGUI.start_processing) but mutates Tk widgets directly.
    Tkinter is generally not thread-safe — confirm this is acceptable or
    marshal UI updates onto the main loop.
    """

    def log(message, color="black"):
        """Append a timestamped, color-tagged line to the GUI log box."""
        log_text.config(state=tk.NORMAL)
        log_text.insert(tk.END, f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {message}\n")
        # Tag only the just-inserted line so it renders in the given color.
        log_text.tag_add(color, f"end-2l", f"end-1l")
        log_text.tag_config(color, foreground=color)
        log_text.see(tk.END)
        log_text.config(state=tk.DISABLED)
        log_text.update()

    try:
        # Initialize Dify client
        log("Initializing Dify connection...", "blue")
        chat = DifyBasicChat(api_key=api_key, base_url=base_url)

        # Create a unique, timestamped save folder for this batch's output.
        result_folder = os.path.join(save_root,
                                     f"Compound_Classification_Results_Batch{batch_num}_{datetime.now().strftime('%Y%m%d%H%M%S')}")
        os.makedirs(result_folder, exist_ok=True)
        log(f"Result save folder: {result_folder}", "blue")

        # Read CSV
        log("Reading CSV file...", "blue")
        df = pd.read_csv(
            csv_path,
            encoding=csv_encoding,
            sep=csv_sep,
            na_filter=True
        )
        df = df.reset_index(drop=True)

        # Check if column exists
        if compound_col not in df.columns:
            raise ValueError(
                f"Column not found in CSV: [{compound_col}]\n"
                f"Current CSV columns: {list(df.columns)}"
            )

        # Remove duplicates and empty values
        compounds = df[compound_col].dropna().unique()
        total = len(compounds)
        log(f"Successfully read {total} non-empty and unique compound names", "green")

        all_rows = []  # Store all successfully expanded row dicts
        failed_list = []  # Placeholder rows for compounds that errored

        # Batch processing: one API call per unique compound.
        for idx, compound in enumerate(compounds, 1):
            compound = str(compound).strip()
            if not compound:
                continue

            # Update progress bar (percentage of compounds handled).
            progress = (idx / total) * 100
            progress_var.set(progress)
            log(f"Processing {idx}/{total}:{compound}", "black")

            try:
                # Call Dify API
                answer, _, full_response = chat.send_message(
                    query=compound,
                    user=f"{user_id}_batch{batch_num}",
                    response_mode="blocking"
                )

                # Parse results
                parsed_categories = parse_dify_response(answer)

                # Expand EndpointName into multiple rows (without AC50 matching)
                expanded_rows = expand_endpoint_rows(parsed_categories, compound)
                all_rows.extend(expanded_rows)

                # Save original record (for debugging)
                record_file = os.path.join(result_folder, f"Original_Record_{idx}.json")
                with open(record_file, "w", encoding="utf-8") as f:
                    json.dump({
                        "Input Compound": compound,
                        "Dify Original Response": answer,
                        "Complete Response": full_response,
                        "Parsed Classification": parsed_categories,
                        "Expanded Rows Count": len(expanded_rows),
                        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }, f, ensure_ascii=False, indent=4)

                log(f"✅ Processing completed: {compound} | Main Category: {parsed_categories['Main Category']} | Generated {len(expanded_rows)} rows",
                    "green")
                time.sleep(0.5)  # Avoid too fast requests

            except Exception as e:
                # One failed compound must not abort the batch: record a
                # placeholder row carrying the error text and continue.
                error_msg = str(e)
                log(f"❌ Processing failed: {compound} | Error: {error_msg}", "red")
                failed_list.append({
                    "CompoundName": normalize_compound_name(compound),
                    "OriginalCompoundName": compound,
                    #"CASRN": "",
                    "MainCategory": f"Processing Failed: {error_msg}",
                    "AdditionalCategory1": "",
                    "AdditionalCategory2": "",
                    "EndpointName": "",
                    "XLogP": "",
                    "BioPathway": "",
                    "ToxicityInfo": "",
                    "KnownUse": "",
                    "DisorderDisease": ""
                })

        # Merge results and save
        result_df = pd.DataFrame(all_rows)

        # Add failed records
        if failed_list:
            failed_df = pd.DataFrame(failed_list)
            result_df = pd.concat([result_df, failed_df], ignore_index=True)

        # Define column order
        column_order = [
            "CompoundName",
            "OriginalCompoundName",
            #"CASRN",
            "MainCategory",
            "AdditionalCategory1",
            "AdditionalCategory2",
            "EndpointName",
            "XLogP",
            "BioPathway",
            "ToxicityInfo",
            "KnownUse",
            "DisorderDisease"
        ]

        # Ensure all columns exist (empty batches may be missing some).
        for col in column_order:
            if col not in result_df.columns:
                result_df[col] = ""

        # Reorder columns
        result_df = result_df.reindex(columns=column_order)

        # Save final CSV (utf-8-sig so Excel recognizes the encoding).
        csv_filename = f"Compound_Query_Results_Batch{batch_num}.csv"
        csv_path_out = os.path.join(result_folder, csv_filename)
        result_df.to_csv(csv_path_out, index=False, encoding="utf-8-sig")
        log(f"📄 Result file saved to: {csv_path_out}", "blue")
        log(f"📊 Total Rows: {len(result_df)} rows", "blue")

        # Save failed list (separate file)
        if failed_list:
            fail_file = os.path.join(result_folder, f"Failed_List_Batch{batch_num}.csv")
            pd.DataFrame(failed_list).to_csv(fail_file, index=False, encoding="utf-8-sig")
            log(f"❌ {len(failed_list)} compounds failed to process, details: {fail_file}", "red")

        # Update progress and log after completion
        progress_var.set(100)
        log(f"\n{'=' * 40}", "blue")
        log(f"🏁 Processing Complete!", "green")
        log(f"{'=' * 40}", "blue")
        log(f"📊 Statistics: Total Compounds={total} | Successful Rows={len(all_rows)} | Failed Compounds={len(failed_list)}", "blue")
        log(f"📁 All results saved to: {result_folder}", "blue")

        # Ask if open result folder
        if messagebox.askyesno("Processing Complete", f"Batch processing completed!\nTotal {len(result_df)} rows of data generated\nOpen result folder?"):
            if os.name == 'nt':  # Windows
                os.startfile(result_folder)
            elif os.name == 'posix':  # macOS, Linux
                import subprocess
                try:
                    if sys.platform == 'darwin':
                        subprocess.run(['open', result_folder])
                    else:
                        subprocess.run(['xdg-open', result_folder])
                except Exception:
                    # BUG FIX: was a bare `except:`, which also swallows
                    # SystemExit/KeyboardInterrupt. Opening the folder is
                    # best-effort, so other failures are still ignored
                    # deliberately.
                    pass

    except Exception as e:
        log(f"❌ Overall processing failed: {str(e)}", "red")
        messagebox.showerror("Error", f"Processing failed: {str(e)}")
    finally:
        # Reset progress bar for the next run.
        progress_var.set(0)
415
-
416
-
417
- # ==================== Graphical User Interface Module ====================
418
class CompoundBatchToolGUI:
    """Tk window for configuring and launching a compound batch query run.

    Builds four stacked areas — file selection, parameter configuration,
    start/progress, and a scrolling color-coded log — and dispatches the
    actual work to batch_process_compounds_gui on a daemon thread.
    """

    def __init__(self, root):
        # root: the tk.Tk (or Toplevel) window this GUI is built into.
        self.root = root
        self.root.title("Compound Batch Query Tool v1.0")
        self.root.geometry("850x700")
        self.root.resizable(True, True)

        # Default configuration
        # NOTE(review): hardcoded API key and internal URL defaults shipped
        # in source — consider loading from env/config instead.
        self.default_api_key = "app-QRGuoLVqSksMsG4t9O53cITj"
        self.default_base_url = "http://192.168.0.179:8080/v1"
        self.default_save_root = "./Compound_Query_Results"
        self.default_compound_col = "IUPAC_name"
        self.default_csv_encoding = "utf-8"
        self.default_csv_sep = ","

        # Create main frame
        main_frame = ttk.Frame(root, padding="20")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # 1. File selection area: read-only path entry + browse button.
        file_frame = ttk.LabelFrame(main_frame, text="1. Select CSV File", padding="10")
        file_frame.pack(fill=tk.X, pady=5)

        self.csv_path_var = tk.StringVar()
        ttk.Entry(file_frame, textvariable=self.csv_path_var, state="readonly", width=65).grid(row=0, column=1, padx=5, pady=5)
        ttk.Button(file_frame, text="Select File", command=self.select_csv_file).grid(row=0, column=0, padx=5, pady=5)

        # 2. Parameter configuration area
        param_frame = ttk.LabelFrame(main_frame, text="2. Parameter Configuration", padding="10")
        param_frame.pack(fill=tk.X, pady=5)

        # 2.1 Dify configuration (API key + endpoint URL)
        ttk.Label(param_frame, text="Dify API Key:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=3)
        self.api_key_var = tk.StringVar(value=self.default_api_key)
        ttk.Entry(param_frame, textvariable=self.api_key_var, width=60).grid(row=0, column=1, columnspan=3, padx=5, pady=3)

        ttk.Label(param_frame, text="Dify URL:").grid(row=1, column=0, sticky=tk.W, padx=5, pady=3)
        self.base_url_var = tk.StringVar(value=self.default_base_url)
        ttk.Entry(param_frame, textvariable=self.base_url_var, width=60).grid(row=1, column=1, columnspan=3, padx=5, pady=3)

        # 2.2 CSV configuration (column name, encoding, separator)
        ttk.Label(param_frame, text="Compound Column Name:").grid(row=2, column=0, sticky=tk.W, padx=5, pady=3)
        self.compound_col_var = tk.StringVar(value=self.default_compound_col)
        ttk.Entry(param_frame, textvariable=self.compound_col_var, width=20).grid(row=2, column=1, padx=5, pady=3)

        ttk.Label(param_frame, text="CSV Encoding:").grid(row=2, column=2, sticky=tk.W, padx=5, pady=3)
        self.csv_encoding_var = tk.StringVar(value=self.default_csv_encoding)
        ttk.Entry(param_frame, textvariable=self.csv_encoding_var, width=15).grid(row=2, column=3, padx=5, pady=3)

        ttk.Label(param_frame, text="CSV Separator:").grid(row=3, column=0, sticky=tk.W, padx=5, pady=3)
        self.csv_sep_var = tk.StringVar(value=self.default_csv_sep)
        ttk.Entry(param_frame, textvariable=self.csv_sep_var, width=20).grid(row=3, column=1, padx=5, pady=3)

        # 2.3 Save configuration (AC50 folder removed)
        ttk.Label(param_frame, text="Result Save Path:").grid(row=4, column=0, sticky=tk.W, padx=5, pady=3)
        self.save_root_var = tk.StringVar(value=self.default_save_root)
        ttk.Entry(param_frame, textvariable=self.save_root_var, width=50).grid(row=4, column=1, columnspan=2, padx=5, pady=3)
        ttk.Button(param_frame, text="Select Path", command=self.select_save_root).grid(row=4, column=3, padx=5, pady=3)

        # 3. Operation area: progress bar + start button.
        op_frame = ttk.LabelFrame(main_frame, text="3. Start Processing", padding="10")
        op_frame.pack(fill=tk.X, pady=5)

        self.progress_var = tk.DoubleVar()
        progress_bar = ttk.Progressbar(op_frame, variable=self.progress_var, maximum=100)
        progress_bar.pack(fill=tk.X, padx=5, pady=5)

        self.start_btn = ttk.Button(op_frame, text="Start Batch Processing", command=self.start_processing)
        self.start_btn.pack(pady=5)

        # 4. Log output area (kept read-only; the worker toggles state to write)
        log_frame = ttk.LabelFrame(main_frame, text="4. Processing Log", padding="10")
        log_frame.pack(fill=tk.BOTH, expand=True, pady=5)

        self.log_text = scrolledtext.ScrolledText(log_frame, wrap=tk.WORD, state=tk.DISABLED)
        self.log_text.pack(fill=tk.BOTH, expand=True)
        # Set log color tags used by the worker's log() helper.
        self.log_text.tag_config("red", foreground="red")
        self.log_text.tag_config("green", foreground="green")
        self.log_text.tag_config("blue", foreground="blue")
        self.log_text.tag_config("orange", foreground="orange")
        self.log_text.tag_config("gray", foreground="gray")

        # 5. Bottom tip (AC50 related tip removed)
        tip_label = ttk.Label(main_frame,
                              text="Tip: Each endpoint returned by Dify generates a separate row in the result",
                              foreground="gray")
        tip_label.pack(side=tk.BOTTOM, pady=10)

    def select_csv_file(self):
        """Open a file dialog and store the chosen CSV path."""
        file_path = filedialog.askopenfilename(
            title="Select Compound CSV File",
            filetypes=[("CSV Files", "*.csv"), ("All Files", "*.*")]
        )
        if file_path:
            self.csv_path_var.set(file_path)

    def select_save_root(self):
        """Open a directory dialog and store the chosen save folder."""
        folder_path = filedialog.askdirectory(title="Select Result Save Folder")
        if folder_path:
            self.save_root_var.set(folder_path)

    def start_processing(self):
        """Validate inputs and launch batch processing on a daemon thread.

        Keeps the UI responsive by running batch_process_compounds_gui off
        the main thread; the start button is disabled for the duration.
        """
        # Verify required parameters
        csv_path = self.csv_path_var.get()
        if not csv_path:
            messagebox.showwarning("Warning", "Please select a CSV file first!")
            return

        api_key = self.api_key_var.get().strip()
        if not api_key:
            messagebox.showwarning("Warning", "Please fill in the Dify API Key!")
            return

        base_url = self.base_url_var.get().strip()
        if not base_url:
            messagebox.showwarning("Warning", "Please fill in the Dify URL!")
            return

        # Disable start button to prevent duplicate clicks
        self.start_btn.config(state=tk.DISABLED)

        # Clear log
        self.log_text.config(state=tk.NORMAL)
        self.log_text.delete(1.0, tk.END)
        self.log_text.config(state=tk.DISABLED)

        # New thread for processing (avoid UI freezing).
        # NOTE(review): the worker updates Tk widgets directly; Tkinter is
        # generally not thread-safe — confirm this is acceptable.
        def process_thread():
            try:
                batch_process_compounds_gui(
                    csv_path=csv_path,
                    save_root=self.save_root_var.get(),
                    api_key=api_key,
                    base_url=base_url,
                    log_text=self.log_text,
                    progress_var=self.progress_var,
                    compound_col=self.compound_col_var.get(),
                    csv_encoding=self.csv_encoding_var.get(),
                    csv_sep=self.csv_sep_var.get()
                )
            finally:
                # Restore button state even if processing raised.
                self.start_btn.config(state=tk.NORMAL)

        threading.Thread(target=process_thread, daemon=True).start()
571
-
572
-
573
# ==================== Start Program ====================
if __name__ == "__main__":
    # Normal GUI startup (AC50 debug mode removed): build the root window,
    # attach the tool GUI, and enter the Tk event loop.
    main_window = tk.Tk()
    app = CompoundBatchToolGUI(main_window)
    main_window.mainloop()