Qianhui19 commited on
Commit
91a6d73
·
verified ·
1 Parent(s): 6b3fc20

Upload 2 files

Browse files
step1_pubchemlite_invitrodb_to_dify_en.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Import dependency libraries
from flask import Flask, request, jsonify
import pandas as pd
import sqlalchemy
from sqlalchemy.exc import SQLAlchemyError

# Create Flask application
app = Flask(__name__)

# Configure connection information for two databases (please modify username/password/address according to actual environment)
# NOTE(review): credentials are hard-coded in source control here — prefer
# loading them from environment variables or a config file in production.
DB_CONFIGS = {
    "pubchemlite": {
        "uri": "mysql+pymysql://sql_user:SQLUSER@localhost:3306/pubchemlite"
    },
    "invitrodb_v4_3": {
        "uri": "mysql+pymysql://sql_user:SQLUSER@localhost:3306/invitrodb_v4_3"  # Please confirm database name and password
    }
}

# List of allowed database identifiers
ALLOWED_DB_IDENTIFIERS = list(DB_CONFIGS.keys())
# Only allow SELECT operations (security restriction)
ALLOWED_SQL_OPERATIONS = ["select"]
26
# Core interface: /execute_sql (supports POST method, GET only returns description)

# Engines are cached per database so each request reuses the existing
# connection pool instead of creating (and leaking) a new engine on every call.
_ENGINE_CACHE = {}


def _get_engine(db_identifier):
    """Return the cached SQLAlchemy engine for *db_identifier*, creating it lazily."""
    engine = _ENGINE_CACHE.get(db_identifier)
    if engine is None:
        engine = sqlalchemy.create_engine(
            DB_CONFIGS[db_identifier]["uri"],
            pool_pre_ping=True,  # Check connection validity
            pool_recycle=3600    # Recycle connections every 1 hour to prevent timeouts
        )
        _ENGINE_CACHE[db_identifier] = engine
    return engine


@app.route('/execute_sql', methods=['GET', 'POST'])
def execute_sql():
    """Execute a read-only SQL query against one of the configured databases.

    POST JSON body: {"db_identifier": "pubchemlite"|"invitrodb_v4_3", "sql": "SELECT ..."}.
    GET returns a usage description. All responses are JSON with
    status / data / message fields; errors use HTTP 400 (client) or 500 (database).
    """
    # GET request returns interface description
    if request.method == 'GET':
        return jsonify({
            "status": "info",
            "message": "Please use POST method to call, parameters include db_identifier (pubchemlite/invitrodb_v4_3) and sql (SELECT only)"
        })

    db_identifier = None  # defined before try so the SQLAlchemyError handler can read it
    try:
        # 1. Get and validate request parameters.
        # silent=True returns None on a missing/invalid JSON body instead of
        # letting Flask raise its own non-JSON 400/415 error page.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({
                "status": "error",
                "message": "Request body cannot be empty, must contain db_identifier and sql parameters"
            }), 400

        # Get database identifier and validate
        db_identifier = data.get("db_identifier")
        if not db_identifier:
            return jsonify({
                "status": "error",
                "message": "Missing required parameter: db_identifier (optional values: pubchemlite/invitrodb_v4_3)"
            }), 400

        if db_identifier not in ALLOWED_DB_IDENTIFIERS:
            return jsonify({
                "status": "error",
                "message": f"Invalid db_identifier: {db_identifier}, only supports: {','.join(ALLOWED_DB_IDENTIFIERS)}"
            }), 400

        # Get SQL and validate
        sql = data.get("sql")
        if not sql:
            return jsonify({
                "status": "error",
                "message": "Missing required parameter: sql (SELECT query statements only)"
            }), 400

        # Security check: only allow SELECT operations
        sql_lower = sql.strip().lower()
        if not sql_lower.startswith(tuple(ALLOWED_SQL_OPERATIONS)):
            return jsonify({
                "status": "error",
                "message": "Only SELECT query operations are supported, dangerous operations like INSERT/UPDATE/DELETE/DROP are prohibited"
            }), 400

        # Reject stacked statements ("SELECT ...; DROP ...") — a bare prefix
        # check alone does not prevent them. A single trailing semicolon is allowed.
        if ";" in sql_lower.rstrip().rstrip(";"):
            return jsonify({
                "status": "error",
                "message": "Only SELECT query operations are supported, dangerous operations like INSERT/UPDATE/DELETE/DROP are prohibited"
            }), 400

        # 2. Reuse (or lazily create) the connection engine for the target database
        engine = _get_engine(db_identifier)

        # 3. Execute SQL query
        with engine.connect() as conn:
            df = pd.read_sql(sql, conn)

        # 4. Return successful result
        return jsonify({
            "status": "success",
            "data": df.to_dict(orient="records"),
            "message": ""
        })

    # Database execution error (table/field does not exist, etc.)
    except SQLAlchemyError as e:
        error_msg = str(e)
        # Refine error messages to fit the two database scenarios
        if "pubchemlite_ccs" in error_msg and db_identifier == "invitrodb_v4_3":
            error_msg = f"pubchemlite_ccs table does not exist in invitrodb_v4_3 database (this table is only supported in pubchemlite): {error_msg}"
        elif "CompoundName" in error_msg and db_identifier == "invitrodb_v4_3":
            error_msg = f"CompoundName field does not exist in invitrodb_v4_3 database (this field is only supported in pubchemlite_ccs table of pubchemlite): {error_msg}"
        elif "assay" in error_msg and db_identifier == "pubchemlite":
            error_msg = f"assay table does not exist in pubchemlite database (this table is only supported in invitrodb_v4_3): {error_msg}"
        return jsonify({
            "status": "error",
            "message": f"Database execution error: {error_msg}"
        }), 500

    # Client issues such as parameter/format errors
    except Exception as e:
        return jsonify({
            "status": "error",
            "message": f"Request processing failed: {str(e)}"
        }), 400
115
+
116
+
117
# Root directory route (provides interface usage instructions)
@app.route('/')
def home():
    """Serve a static HTML usage page describing the /execute_sql endpoint."""
    # NOTE(review): indentation inside this HTML literal was lost in the diff
    # rendering; whitespace is insignificant to the browser, but confirm the
    # original layout if byte-exact output matters.
    return """
    <h1>SQL Execution API (Supports Dual Databases)</h1>
    <p>Please send POST requests to <code>/execute_sql</code> to execute SQL queries.</p>
    <p>Request parameters:</p>
    <ul>
    <li>db_identifier: Target database (required, optional values: pubchemlite/invitrodb_v4_3)</li>
    <li>sql: SELECT query statement (required, SELECT operations only)</li>
    </ul>
    <p>Example 1 (query pubchemlite):</p>
    <pre>
    curl -X POST http://127.0.0.1:5000/execute_sql \\
    -H "Content-Type: application/json" \\
    -d '{
    "db_identifier": "pubchemlite",
    "sql": "SELECT Identifier, CompoundName, MolecularFormula FROM pubchemlite_ccs WHERE PubMed_Count > 5 LIMIT 10"
    }'
    </pre>
    <p>Example 2 (query invitrodb_v4_3):</p>
    <pre>
    curl -X POST http://127.0.0.1:5000/execute_sql \\
    -H "Content-Type: application/json" \\
    -d '{
    "db_identifier": "invitrodb_v4_3",
    "sql": "SELECT aid, assay_name, organism FROM assay WHERE ncbi_taxon_id = 9606 LIMIT 10"
    }'
    </pre>
    """
147
+
148
+
149
# Start the application (listen on all network interfaces, port 5000, debug mode can be turned off)
if __name__ == '__main__':
    # NOTE(review): host='0.0.0.0' exposes this API on every interface with no
    # authentication — confirm it is only reachable on a trusted network.
    app.run(host='0.0.0.0', port=5000, debug=False)
step2_CECs_annotating_agent_v1.0.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==================== Compound Batch Query Tool (Desktop Version) ====================
2
+ # Supports batch query (AC50 matching function removed)
3
+
4
+ import tkinter as tk
5
+ from tkinter import ttk, filedialog, messagebox, scrolledtext
6
+ import pandas as pd
7
+ import requests
8
+ import json
9
+ import os
10
+ import time
11
+ from typing import Optional, Dict, List
12
+ from datetime import datetime
13
+ import threading
14
+ import sys
15
+
16
+
17
# ==================== Core Function Module ====================
class DifyBasicChat:
    """Dify Basic Chat Function Encapsulation.

    Thin client around the Dify ``/chat-messages`` HTTP endpoint. Supports
    blocking responses and server-sent-event (SSE) streaming responses.
    """

    def __init__(self, api_key: str, base_url: str = "http://localhost/v1"):
        # The API key is sent as a Bearer token on every request.
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def send_message(
        self,
        query: str,
        user: str,
        conversation_id: Optional[str] = None,
        response_mode: str = "blocking",
        inputs: Optional[dict] = None
    ) -> tuple:
        """Send chat message.

        Returns a 3-tuple ``(answer_text, conversation_id, full_response)``.
        On a transport failure the answer text carries the error message,
        conversation_id is None and full_response is ``{"error": ...}``.
        """
        url = f"{self.base_url}/chat-messages"
        payload = {
            "query": query,
            "user": user,
            "response_mode": response_mode,
            "inputs": inputs or {}
        }
        if conversation_id:
            payload["conversation_id"] = conversation_id

        full_response = None
        try:
            if response_mode == "blocking":
                res = requests.post(url, headers=self.headers, json=payload, timeout=120)
                res.raise_for_status()
                full_response = res.json()
                answer = full_response.get("answer", "")
                conv_id = full_response.get("conversation_id")
                return answer, conv_id, full_response

            else:
                # Streaming mode: accumulate SSE "message" events until "message_end".
                full_answer = ""
                conv_id = None
                res = requests.post(url, headers=self.headers, json=payload, stream=True, timeout=120)
                res.raise_for_status()
                for line in res.iter_lines():
                    if line:
                        # BUGFIX: the previous `.lstrip("data: ")` stripped any
                        # leading characters from the SET {d,a,t,' ',':'} — it
                        # could mangle non-"data:" SSE lines (e.g. "event: x"
                        # became "vent: x"). Strip the exact "data: " prefix.
                        line_data = line.decode("utf-8")
                        if line_data.startswith("data: "):
                            line_data = line_data[len("data: "):]
                        if line_data:
                            try:
                                data = json.loads(line_data)
                                full_response = data
                                if data.get("event") == "message":
                                    full_answer += data.get("answer", "")
                                elif data.get("event") == "message_end":
                                    conv_id = data.get("conversation_id")
                                    break
                                elif data.get("event") == "error":
                                    raise Exception(f"Streaming Error: {data.get('message')}")
                            except json.JSONDecodeError:
                                # Non-JSON keep-alive / comment lines are skipped.
                                continue
                return full_answer, conv_id, full_response

        except requests.exceptions.RequestException as e:
            error_msg = f"Request Failed: {str(e)}"
            return error_msg, None, {"error": error_msg}
84
+
85
+
86
def parse_dify_response(answer_text: str) -> dict:
    """Parse classification and complete information returned by Dify.

    Accepts either the nested format ``{"CompoundName": {...fields...}}`` or
    the flat format ``{...fields...}``; markdown code fences around the JSON
    are stripped first. On a parse failure the error text is stored in
    "Main Category" so the caller still receives a well-formed record.

    BUGFIX: the flat-format branch previously extracted only the three
    category fields, silently dropping EndpointName/XLogP/BioPathway/
    ToxicityInfo/KnownUse/DisorderDisease; both formats now share one
    extraction helper.
    """
    result = {
        # "CASRN": "",
        "Main Category": "",
        "Additional Category 1": "",
        "Additional Category 2": "",
        "EndpointName": [],  # Keep for compatibility, no longer used for matching
        "XLogP": "",
        "BioPathway": "",
        "ToxicityInfo": "",
        "KnownUse": "",
        "DisorderDisease": ""
    }

    def _fill(category_info: dict):
        """Copy every known field from *category_info* into result."""
        for key in ("Main Category", "Additional Category 1", "Additional Category 2",
                    "XLogP", "BioPathway", "ToxicityInfo", "KnownUse", "DisorderDisease"):
            result[key] = category_info.get(key, "")
        # Process EndpointName - may be list or string
        endpoint_value = category_info.get("EndpointName", [])
        if isinstance(endpoint_value, list):
            result["EndpointName"] = endpoint_value
        elif isinstance(endpoint_value, str):
            result["EndpointName"] = [endpoint_value] if endpoint_value else []

    try:
        clean_text = answer_text.strip()
        # Clean code block markers
        if clean_text.startswith("```json"):
            clean_text = clean_text.replace("```json", "").replace("```", "").strip()
        elif clean_text.startswith("```"):
            clean_text = clean_text.replace("```", "").strip()

        # Parse JSON
        response_json = json.loads(clean_text)

        if isinstance(response_json, dict):
            # Get compound name (first key)
            compound_name = next(iter(response_json.keys())) if response_json else ""

            if compound_name and isinstance(response_json.get(compound_name), dict):
                # Nested format: {"CompoundName": {...}}
                _fill(response_json[compound_name])
            else:
                # Flat format (compatible with old format)
                _fill(response_json)

    except json.JSONDecodeError as e:
        result["Main Category"] = f"JSON Parsing Error: {str(e)}"
    except Exception as e:
        result["Main Category"] = f"Parsing Failed: {str(e)}"

    return result
151
+
152
+
153
def normalize_compound_name(name: str) -> str:
    """Normalize a compound name: strip one pair of surrounding quotes and
    collapse runs of whitespace.

    Non-string input (e.g. NaN from pandas) yields "".
    """
    if not isinstance(name, str):
        return ""

    # Remove quotes (only a matching surrounding pair)
    name = name.strip()
    if name.startswith('"') and name.endswith('"'):
        name = name[1:-1]
    elif name.startswith("'") and name.endswith("'"):
        name = name[1:-1]

    # Remove extra spaces
    return ' '.join(name.split())


def expand_endpoint_rows(parsed_result: dict, compound_name: str) -> list:
    """Expand EndpointName into one output row per endpoint (no AC50 matching).

    If the parsed result carries no endpoints, a single row with an empty
    EndpointName is emitted so the compound still appears in the output.
    The 12-key row template, previously duplicated verbatim in two branches,
    is built by a single helper.
    """
    # Normalize compound name
    compound_clean = normalize_compound_name(compound_name)

    def _row(endpoint: str) -> dict:
        """Build one result row; all rows share the same classification fields."""
        return {
            "CompoundName": compound_clean,
            "OriginalCompoundName": compound_name,
            # "CASRN": parsed_result.get("CASRN", ""),
            "MainCategory": parsed_result.get("Main Category", ""),
            "AdditionalCategory1": parsed_result.get("Additional Category 1", ""),
            "AdditionalCategory2": parsed_result.get("Additional Category 2", ""),
            "EndpointName": endpoint,
            "XLogP": parsed_result.get("XLogP", ""),
            "BioPathway": parsed_result.get("BioPathway", ""),
            "ToxicityInfo": parsed_result.get("ToxicityInfo", ""),
            "KnownUse": parsed_result.get("KnownUse", ""),
            "DisorderDisease": parsed_result.get("DisorderDisease", "")
        }

    endpoint_names = parsed_result.get("EndpointName", [])
    if not endpoint_names:
        # Create one placeholder row if no EndpointName
        return [_row("")]
    # Create one row per endpoint (without AC50 matching)
    return [_row(endpoint) for endpoint in endpoint_names]
219
+
220
+
221
def batch_process_compounds_gui(
    csv_path: str,
    save_root: str,
    api_key: str,
    base_url: str,
    log_text: tk.Text,
    progress_var: tk.DoubleVar,
    user_id: str = "batch_compound_user",
    compound_col: str = "IUPAC_name",
    batch_num: int = 1,
    csv_encoding: str = "utf-8",
    csv_sep: str = ","
):
    """Batch process compounds (adapted for GUI, AC50 matching removed).

    Reads compound names from the *compound_col* column of the CSV at
    *csv_path*, sends each one to Dify, expands the parsed endpoints into
    rows, and writes per-compound JSON records plus a merged CSV under a
    timestamped folder inside *save_root*. Progress and log messages are
    pushed to the supplied Tk widgets.

    NOTE(review): this function runs on a worker thread (see
    CompoundBatchToolGUI.start_processing) yet updates Tk widgets directly;
    Tkinter is not guaranteed thread-safe — confirm this is acceptable.
    """

    def log(message, color="black"):
        """Output log to GUI text box (timestamped line, colored via a tag)."""
        log_text.config(state=tk.NORMAL)
        log_text.insert(tk.END, f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {message}\n")
        log_text.tag_add(color, f"end-2l", f"end-1l")
        log_text.tag_config(color, foreground=color)
        log_text.see(tk.END)
        log_text.config(state=tk.DISABLED)
        log_text.update()

    try:
        # Initialize Dify client
        log("Initializing Dify connection...", "blue")
        chat = DifyBasicChat(api_key=api_key, base_url=base_url)

        # Create save folder (timestamped so repeated runs never collide)
        result_folder = os.path.join(save_root,
                                     f"Compound_Classification_Results_Batch{batch_num}_{datetime.now().strftime('%Y%m%d%H%M%S')}")
        os.makedirs(result_folder, exist_ok=True)
        log(f"Result save folder: {result_folder}", "blue")

        # Read CSV
        log("Reading CSV file...", "blue")
        df = pd.read_csv(
            csv_path,
            encoding=csv_encoding,
            sep=csv_sep,
            na_filter=True
        )
        df = df.reset_index(drop=True)

        # Check if column exists
        if compound_col not in df.columns:
            raise ValueError(
                f"Column not found in CSV: [{compound_col}]\n"
                f"Current CSV columns: {list(df.columns)}"
            )

        # Remove duplicates and empty values
        compounds = df[compound_col].dropna().unique()
        total = len(compounds)
        log(f"Successfully read {total} non-empty and unique compound names", "green")

        all_rows = []  # Store all row data
        failed_list = []

        # Batch processing
        for idx, compound in enumerate(compounds, 1):
            compound = str(compound).strip()
            if not compound:
                continue

            # Update progress
            progress = (idx / total) * 100
            progress_var.set(progress)
            log(f"Processing {idx}/{total}:{compound}", "black")

            try:
                # Call Dify API (blocking mode: one request per compound)
                answer, _, full_response = chat.send_message(
                    query=compound,
                    user=f"{user_id}_batch{batch_num}",
                    response_mode="blocking"
                )

                # Parse results
                parsed_categories = parse_dify_response(answer)

                # Expand EndpointName into multiple rows (without AC50 matching)
                expanded_rows = expand_endpoint_rows(parsed_categories, compound)
                all_rows.extend(expanded_rows)

                # Save original record (for debugging)
                record_file = os.path.join(result_folder, f"Original_Record_{idx}.json")
                with open(record_file, "w", encoding="utf-8") as f:
                    json.dump({
                        "Input Compound": compound,
                        "Dify Original Response": answer,
                        "Complete Response": full_response,
                        "Parsed Classification": parsed_categories,
                        "Expanded Rows Count": len(expanded_rows),
                        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }, f, ensure_ascii=False, indent=4)

                log(f"✅ Processing completed: {compound} | Main Category: {parsed_categories['Main Category']} | Generated {len(expanded_rows)} rows",
                    "green")
                time.sleep(0.5)  # Avoid too fast requests

            except Exception as e:
                # One failed compound must not abort the batch: record it and continue.
                error_msg = str(e)
                log(f"❌ Processing failed: {compound} | Error: {error_msg}", "red")
                failed_list.append({
                    "CompoundName": normalize_compound_name(compound),
                    "OriginalCompoundName": compound,
                    # "CASRN": "",
                    "MainCategory": f"Processing Failed: {error_msg}",
                    "AdditionalCategory1": "",
                    "AdditionalCategory2": "",
                    "EndpointName": "",
                    "XLogP": "",
                    "BioPathway": "",
                    "ToxicityInfo": "",
                    "KnownUse": "",
                    "DisorderDisease": ""
                })

        # Merge results and save
        result_df = pd.DataFrame(all_rows)

        # Add failed records
        if failed_list:
            failed_df = pd.DataFrame(failed_list)
            result_df = pd.concat([result_df, failed_df], ignore_index=True)

        # Define column order
        column_order = [
            "CompoundName",
            "OriginalCompoundName",
            # "CASRN",
            "MainCategory",
            "AdditionalCategory1",
            "AdditionalCategory2",
            "EndpointName",
            "XLogP",
            "BioPathway",
            "ToxicityInfo",
            "KnownUse",
            "DisorderDisease"
        ]

        # Ensure all columns exist
        for col in column_order:
            if col not in result_df.columns:
                result_df[col] = ""

        # Reorder columns
        result_df = result_df.reindex(columns=column_order)

        # Save final CSV (utf-8-sig so Excel detects the encoding)
        csv_filename = f"Compound_Query_Results_Batch{batch_num}.csv"
        csv_path_out = os.path.join(result_folder, csv_filename)
        result_df.to_csv(csv_path_out, index=False, encoding="utf-8-sig")
        log(f"📄 Result file saved to: {csv_path_out}", "blue")
        log(f"📊 Total Rows: {len(result_df)} rows", "blue")

        # Save failed list (separate file)
        if failed_list:
            fail_file = os.path.join(result_folder, f"Failed_List_Batch{batch_num}.csv")
            pd.DataFrame(failed_list).to_csv(fail_file, index=False, encoding="utf-8-sig")
            log(f"❌ {len(failed_list)} compounds failed to process, details: {fail_file}", "red")

        # Update progress and log after completion
        progress_var.set(100)
        log(f"\n{'=' * 40}", "blue")
        log(f"🏁 Processing Complete!", "green")
        log(f"{'=' * 40}", "blue")
        log(f"📊 Statistics: Total Compounds={total} | Successful Rows={len(all_rows)} | Failed Compounds={len(failed_list)}", "blue")
        log(f"📁 All results saved to: {result_folder}", "blue")

        # Ask if open result folder
        if messagebox.askyesno("Processing Complete", f"Batch processing completed!\nTotal {len(result_df)} rows of data generated\nOpen result folder?"):
            if os.name == 'nt':  # Windows
                os.startfile(result_folder)
            elif os.name == 'posix':  # macOS, Linux
                import subprocess
                try:
                    if sys.platform == 'darwin':
                        subprocess.run(['open', result_folder])
                    else:
                        subprocess.run(['xdg-open', result_folder])
                except:
                    # NOTE(review): bare except — best-effort open of the file
                    # browser; consider narrowing to Exception.
                    pass

    except Exception as e:
        log(f"❌ Overall processing failed: {str(e)}", "red")
        messagebox.showerror("Error", f"Processing failed: {str(e)}")
    finally:
        # Reset progress
        progress_var.set(0)
415
+
416
+
417
# ==================== Graphical User Interface Module ====================
class CompoundBatchToolGUI:
    """Tk window that configures and launches the batch compound query."""

    def __init__(self, root):
        # root: the Tk() top-level window supplied by the caller.
        self.root = root
        self.root.title("Compound Batch Query Tool v1.0")
        self.root.geometry("850x700")
        self.root.resizable(True, True)

        # Default configuration
        # NOTE(review): the API key below is committed in source — rotate it
        # and load it from configuration/environment instead.
        self.default_api_key = "app-QRGuoLVqSksMsG4t9O53cITj"
        self.default_base_url = "http://192.168.0.179:8080/v1"
        self.default_save_root = "./Compound_Query_Results"
        self.default_compound_col = "IUPAC_name"
        self.default_csv_encoding = "utf-8"
        self.default_csv_sep = ","

        # Create main frame
        main_frame = ttk.Frame(root, padding="20")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # 1. File selection area
        file_frame = ttk.LabelFrame(main_frame, text="1. Select CSV File", padding="10")
        file_frame.pack(fill=tk.X, pady=5)

        self.csv_path_var = tk.StringVar()
        ttk.Entry(file_frame, textvariable=self.csv_path_var, state="readonly", width=65).grid(row=0, column=1, padx=5,
                                                                                              pady=5)
        ttk.Button(file_frame, text="Select File", command=self.select_csv_file).grid(row=0, column=0, padx=5, pady=5)

        # 2. Parameter configuration area
        param_frame = ttk.LabelFrame(main_frame, text="2. Parameter Configuration", padding="10")
        param_frame.pack(fill=tk.X, pady=5)

        # 2.1 Dify configuration
        ttk.Label(param_frame, text="Dify API Key:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=3)
        self.api_key_var = tk.StringVar(value=self.default_api_key)
        ttk.Entry(param_frame, textvariable=self.api_key_var, width=60).grid(row=0, column=1, columnspan=3, padx=5,
                                                                            pady=3)

        ttk.Label(param_frame, text="Dify URL:").grid(row=1, column=0, sticky=tk.W, padx=5, pady=3)
        self.base_url_var = tk.StringVar(value=self.default_base_url)
        ttk.Entry(param_frame, textvariable=self.base_url_var, width=60).grid(row=1, column=1, columnspan=3, padx=5,
                                                                             pady=3)

        # 2.2 CSV configuration
        ttk.Label(param_frame, text="Compound Column Name:").grid(row=2, column=0, sticky=tk.W, padx=5, pady=3)
        self.compound_col_var = tk.StringVar(value=self.default_compound_col)
        ttk.Entry(param_frame, textvariable=self.compound_col_var, width=20).grid(row=2, column=1, padx=5, pady=3)

        ttk.Label(param_frame, text="CSV Encoding:").grid(row=2, column=2, sticky=tk.W, padx=5, pady=3)
        self.csv_encoding_var = tk.StringVar(value=self.default_csv_encoding)
        ttk.Entry(param_frame, textvariable=self.csv_encoding_var, width=15).grid(row=2, column=3, padx=5, pady=3)

        ttk.Label(param_frame, text="CSV Separator:").grid(row=3, column=0, sticky=tk.W, padx=5, pady=3)
        self.csv_sep_var = tk.StringVar(value=self.default_csv_sep)
        ttk.Entry(param_frame, textvariable=self.csv_sep_var, width=20).grid(row=3, column=1, padx=5, pady=3)

        # 2.3 Save configuration (AC50 folder removed)
        ttk.Label(param_frame, text="Result Save Path:").grid(row=4, column=0, sticky=tk.W, padx=5, pady=3)
        self.save_root_var = tk.StringVar(value=self.default_save_root)
        ttk.Entry(param_frame, textvariable=self.save_root_var, width=50).grid(row=4, column=1, columnspan=2, padx=5,
                                                                              pady=3)
        ttk.Button(param_frame, text="Select Path", command=self.select_save_root).grid(row=4, column=3, padx=5, pady=3)

        # 3. Operation area
        op_frame = ttk.LabelFrame(main_frame, text="3. Start Processing", padding="10")
        op_frame.pack(fill=tk.X, pady=5)

        self.progress_var = tk.DoubleVar()
        progress_bar = ttk.Progressbar(op_frame, variable=self.progress_var, maximum=100)
        progress_bar.pack(fill=tk.X, padx=5, pady=5)

        self.start_btn = ttk.Button(op_frame, text="Start Batch Processing", command=self.start_processing)
        self.start_btn.pack(pady=5)

        # 4. Log output area
        log_frame = ttk.LabelFrame(main_frame, text="4. Processing Log", padding="10")
        log_frame.pack(fill=tk.BOTH, expand=True, pady=5)

        self.log_text = scrolledtext.ScrolledText(log_frame, wrap=tk.WORD, state=tk.DISABLED)
        self.log_text.pack(fill=tk.BOTH, expand=True)
        # Set log color tags
        self.log_text.tag_config("red", foreground="red")
        self.log_text.tag_config("green", foreground="green")
        self.log_text.tag_config("blue", foreground="blue")
        self.log_text.tag_config("orange", foreground="orange")
        self.log_text.tag_config("gray", foreground="gray")

        # 5. Bottom tip (AC50 related tip removed)
        tip_label = ttk.Label(main_frame,
                              text="Tip: Each endpoint returned by Dify generates a separate row in the result",
                              foreground="gray")
        tip_label.pack(side=tk.BOTTOM, pady=10)

    def select_csv_file(self):
        """Select CSV file via the native file-open dialog."""
        file_path = filedialog.askopenfilename(
            title="Select Compound CSV File",
            filetypes=[("CSV Files", "*.csv"), ("All Files", "*.*")]
        )
        if file_path:
            self.csv_path_var.set(file_path)

    def select_save_root(self):
        """Select save path via the native directory-chooser dialog."""
        folder_path = filedialog.askdirectory(title="Select Result Save Folder")
        if folder_path:
            self.save_root_var.set(folder_path)

    def start_processing(self):
        """Start batch processing (new thread to avoid UI freezing)."""
        # Verify required parameters
        csv_path = self.csv_path_var.get()
        if not csv_path:
            messagebox.showwarning("Warning", "Please select a CSV file first!")
            return

        api_key = self.api_key_var.get().strip()
        if not api_key:
            messagebox.showwarning("Warning", "Please fill in the Dify API Key!")
            return

        base_url = self.base_url_var.get().strip()
        if not base_url:
            messagebox.showwarning("Warning", "Please fill in the Dify URL!")
            return

        # Disable start button to prevent duplicate clicks
        self.start_btn.config(state=tk.DISABLED)

        # Clear log
        self.log_text.config(state=tk.NORMAL)
        self.log_text.delete(1.0, tk.END)
        self.log_text.config(state=tk.DISABLED)

        # New thread for processing (avoid UI freezing)
        def process_thread():
            try:
                batch_process_compounds_gui(
                    csv_path=csv_path,
                    save_root=self.save_root_var.get(),
                    api_key=api_key,
                    base_url=base_url,
                    log_text=self.log_text,
                    progress_var=self.progress_var,
                    compound_col=self.compound_col_var.get(),
                    csv_encoding=self.csv_encoding_var.get(),
                    csv_sep=self.csv_sep_var.get()
                )
            finally:
                # Restore button state
                self.start_btn.config(state=tk.NORMAL)

        threading.Thread(target=process_thread, daemon=True).start()
571
+
572
+
573
# ==================== Start Program ====================
if __name__ == "__main__":
    # Normal GUI startup (AC50 debug mode removed)
    root = tk.Tk()
    app = CompoundBatchToolGUI(root)
    root.mainloop()  # Blocks until the window is closed