Qianhui19 commited on
Commit
caf4896
·
verified ·
1 Parent(s): 22e7ff6

Delete step2_CECs_annotating_agent_v1.0.py

Browse files
Files changed (1) hide show
  1. step2_CECs_annotating_agent_v1.0.py +0 -578
step2_CECs_annotating_agent_v1.0.py DELETED
@@ -1,578 +0,0 @@
1
- # ==================== Compound Batch Query Tool (Desktop Version) ====================
2
- # Supports batch query (AC50 matching function removed)
3
-
4
- import tkinter as tk
5
- from tkinter import ttk, filedialog, messagebox, scrolledtext
6
- import pandas as pd
7
- import requests
8
- import json
9
- import os
10
- import time
11
- from typing import Optional, Dict, List
12
- from datetime import datetime
13
- import threading
14
- import sys
15
-
16
-
17
- # ==================== Core Function Module ====================
18
class DifyBasicChat:
    """Minimal client for the Dify ``POST /chat-messages`` endpoint.

    Supports both "blocking" (single JSON response) and streaming (SSE)
    response modes. Network failures never raise: they are returned as an
    error tuple so batch callers can keep going.
    """

    def __init__(self, api_key: str, base_url: str = "http://localhost/v1"):
        self.api_key = api_key
        # Normalize so path joins below never produce a double slash.
        self.base_url = base_url.rstrip("/")
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def send_message(
        self,
        query: str,
        user: str,
        conversation_id: Optional[str] = None,
        response_mode: str = "blocking",
        inputs: dict = None
    ) -> tuple:
        """Send a chat message.

        Returns a ``(answer, conversation_id, full_response)`` tuple. On a
        network error the tuple is ``(error_message, None, {"error": ...})``.
        """
        url = f"{self.base_url}/chat-messages"
        payload = {
            "query": query,
            "user": user,
            "response_mode": response_mode,
            "inputs": inputs or {}
        }
        if conversation_id:
            payload["conversation_id"] = conversation_id

        full_response = None
        try:
            if response_mode == "blocking":
                res = requests.post(url, headers=self.headers, json=payload, timeout=120)
                res.raise_for_status()
                full_response = res.json()
                answer = full_response.get("answer", "")
                conv_id = full_response.get("conversation_id")
                return answer, conv_id, full_response

            else:
                # Streaming (SSE) mode: accumulate "message" events until
                # "message_end" arrives.
                full_answer = ""
                conv_id = None
                res = requests.post(url, headers=self.headers, json=payload, stream=True, timeout=120)
                res.raise_for_status()
                for line in res.iter_lines():
                    if not line:
                        continue
                    decoded = line.decode("utf-8")
                    # BUG FIX: the original used lstrip("data: "), which strips
                    # any run of the characters {d, a, t, ' ', ':'} from the
                    # start of the line rather than the literal SSE "data: "
                    # prefix. Remove the exact prefix instead.
                    if decoded.startswith("data: "):
                        decoded = decoded[len("data: "):]
                    if not decoded:
                        continue
                    try:
                        data = json.loads(decoded)
                    except json.JSONDecodeError:
                        # Ignore keep-alive / non-JSON lines.
                        continue
                    full_response = data
                    event = data.get("event")
                    if event == "message":
                        full_answer += data.get("answer", "")
                    elif event == "message_end":
                        conv_id = data.get("conversation_id")
                        break
                    elif event == "error":
                        raise Exception(f"Streaming Error: {data.get('message')}")
                return full_answer, conv_id, full_response

        except requests.exceptions.RequestException as e:
            error_msg = f"Request Failed: {str(e)}"
            return error_msg, None, {"error": error_msg}
84
-
85
-
86
def parse_dify_response(answer_text: str) -> dict:
    """Parse classification and complete information returned by Dify.

    Accepts either a nested payload ``{"CompoundName": {...fields...}}`` or a
    flat payload (old format) with the category fields at the top level,
    optionally wrapped in a Markdown code fence. Parsing errors are reported
    in-band via the "Main Category" field rather than raised, so one bad
    response does not abort a batch.
    """
    result = {
        #"CASRN": "",
        "Main Category": "",
        "Additional Category 1": "",
        "Additional Category 2": "",
        "EndpointName": [],  # Keep for compatibility, no longer used for matching
        "XLogP": "",
        "BioPathway": "",
        "ToxicityInfo": "",
        "KnownUse": "",
        "DisorderDisease": ""
    }

    try:
        clean_text = answer_text.strip()
        # BUG FIX: the original used str.replace to drop code-fence markers,
        # which also removed backticks occurring *inside* the JSON payload
        # (corrupting string values). Strip only the leading and trailing
        # fence instead.
        if clean_text.startswith("```json"):
            clean_text = clean_text[len("```json"):]
        elif clean_text.startswith("```"):
            clean_text = clean_text[len("```"):]
        if clean_text.endswith("```"):
            clean_text = clean_text[:-len("```")]
        clean_text = clean_text.strip()

        # Parse JSON
        response_json = json.loads(clean_text)

        if isinstance(response_json, dict):
            # Get compound name (first key)
            compound_name = next(iter(response_json.keys())) if response_json else ""

            if compound_name and isinstance(response_json.get(compound_name), dict):
                # Nested format: {"CompoundName": {...}}
                category_info = response_json[compound_name]

                # Extract all fields
                #result["CASRN"] = category_info.get("CASRN", "")
                result["Main Category"] = category_info.get("Main Category", "")
                result["Additional Category 1"] = category_info.get("Additional Category 1", "")
                result["Additional Category 2"] = category_info.get("Additional Category 2", "")

                # Process EndpointName - may be list or string
                endpoint_value = category_info.get("EndpointName", [])
                if isinstance(endpoint_value, list):
                    result["EndpointName"] = endpoint_value
                elif isinstance(endpoint_value, str):
                    result["EndpointName"] = [endpoint_value] if endpoint_value else []

                result["XLogP"] = category_info.get("XLogP", "")
                result["BioPathway"] = category_info.get("BioPathway", "")
                result["ToxicityInfo"] = category_info.get("ToxicityInfo", "")
                result["KnownUse"] = category_info.get("KnownUse", "")
                result["DisorderDisease"] = category_info.get("DisorderDisease", "")

            else:
                # Flat format (compatible with old format)
                result["Main Category"] = response_json.get("Main Category", "")
                result["Additional Category 1"] = response_json.get("Additional Category 1", "")
                result["Additional Category 2"] = response_json.get("Additional Category 2", "")

    except json.JSONDecodeError as e:
        result["Main Category"] = f"JSON Parsing Error: {str(e)}"
    except Exception as e:
        result["Main Category"] = f"Parsing Failed: {str(e)}"

    return result
151
-
152
-
153
def normalize_compound_name(name: str) -> str:
    """Return *name* with one pair of surrounding quotes removed and all
    internal whitespace runs collapsed to single spaces.

    Non-string input yields an empty string.
    """
    if not isinstance(name, str):
        return ""

    cleaned = name.strip()
    # Drop a single matching pair of surrounding quotes, if present.
    for quote in ('"', "'"):
        if cleaned.startswith(quote) and cleaned.endswith(quote):
            cleaned = cleaned[1:-1]
            break

    # Collapse runs of whitespace into single spaces.
    return ' '.join(cleaned.split())
169
-
170
-
171
def expand_endpoint_rows(parsed_result: dict, compound_name: str) -> list:
    """
    Expand EndpointName into multiple rows (without AC50 matching).

    Returns one row dict per entry in parsed_result["EndpointName"]; when
    there are no endpoints, a single row with an empty "EndpointName" is
    returned so the compound still appears in the output CSV.
    """
    # Normalize compound name once; every row shares it.
    compound_clean = normalize_compound_name(compound_name)

    def make_row(endpoint: str) -> dict:
        # DRY FIX: the original duplicated this 12-key literal in both the
        # "no endpoints" and "per endpoint" branches; only EndpointName
        # differs between rows.
        return {
            "CompoundName": compound_clean,
            "OriginalCompoundName": compound_name,
            #"CASRN": parsed_result.get("CASRN", ""),
            "MainCategory": parsed_result.get("Main Category", ""),
            "AdditionalCategory1": parsed_result.get("Additional Category 1", ""),
            "AdditionalCategory2": parsed_result.get("Additional Category 2", ""),
            "EndpointName": endpoint,
            "XLogP": parsed_result.get("XLogP", ""),
            "BioPathway": parsed_result.get("BioPathway", ""),
            "ToxicityInfo": parsed_result.get("ToxicityInfo", ""),
            "KnownUse": parsed_result.get("KnownUse", ""),
            "DisorderDisease": parsed_result.get("DisorderDisease", "")
        }

    endpoint_names = parsed_result.get("EndpointName", [])
    if not endpoint_names:
        # Create one placeholder row if no EndpointName was returned.
        return [make_row("")]
    # Create one row per endpoint (without AC50 matching).
    return [make_row(endpoint) for endpoint in endpoint_names]
219
-
220
-
221
def batch_process_compounds_gui(
    csv_path: str,
    save_root: str,
    api_key: str,
    base_url: str,
    log_text: tk.Text,
    progress_var: tk.DoubleVar,
    user_id: str = "batch_compound_user",
    compound_col: str = "IUPAC_name",
    batch_num: int = 1,
    csv_encoding: str = "utf-8",
    csv_sep: str = ","
):
    """Batch process compounds (adapted for GUI, AC50 matching removed).

    Reads compound names from column ``compound_col`` of ``csv_path``,
    queries the Dify API once per unique compound, expands each parsed
    result into endpoint rows, and writes:
      * one ``Original_Record_<n>.json`` per compound (debug record),
      * the merged results CSV, and
      * a separate failed-compound CSV when failures occurred,
    all into a timestamped folder under ``save_root``. Progress and log
    messages are pushed to the supplied Tk widgets; errors are reported via
    the log/messagebox rather than raised.

    NOTE(review): this function is invoked from a worker thread (see
    CompoundBatchToolGUI.start_processing) but mutates Tk widgets directly.
    Tkinter is generally not thread-safe — confirm this is acceptable or
    marshal UI updates onto the main loop.
    """

    def log(message, color="black"):
        """Append a timestamped, color-tagged line to the GUI log box."""
        log_text.config(state=tk.NORMAL)
        log_text.insert(tk.END, f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {message}\n")
        # Tag only the just-inserted line so it renders in the given color.
        log_text.tag_add(color, f"end-2l", f"end-1l")
        log_text.tag_config(color, foreground=color)
        log_text.see(tk.END)
        log_text.config(state=tk.DISABLED)
        log_text.update()

    try:
        # Initialize Dify client
        log("Initializing Dify connection...", "blue")
        chat = DifyBasicChat(api_key=api_key, base_url=base_url)

        # Create a unique, timestamped save folder for this batch's output.
        result_folder = os.path.join(save_root,
                                     f"Compound_Classification_Results_Batch{batch_num}_{datetime.now().strftime('%Y%m%d%H%M%S')}")
        os.makedirs(result_folder, exist_ok=True)
        log(f"Result save folder: {result_folder}", "blue")

        # Read CSV
        log("Reading CSV file...", "blue")
        df = pd.read_csv(
            csv_path,
            encoding=csv_encoding,
            sep=csv_sep,
            na_filter=True
        )
        df = df.reset_index(drop=True)

        # Check if column exists
        if compound_col not in df.columns:
            raise ValueError(
                f"Column not found in CSV: [{compound_col}]\n"
                f"Current CSV columns: {list(df.columns)}"
            )

        # Remove duplicates and empty values
        compounds = df[compound_col].dropna().unique()
        total = len(compounds)
        log(f"Successfully read {total} non-empty and unique compound names", "green")

        all_rows = []  # Store all successfully expanded row dicts
        failed_list = []  # Placeholder rows for compounds that errored

        # Batch processing: one API call per unique compound.
        for idx, compound in enumerate(compounds, 1):
            compound = str(compound).strip()
            if not compound:
                continue

            # Update progress bar (percentage of compounds handled).
            progress = (idx / total) * 100
            progress_var.set(progress)
            log(f"Processing {idx}/{total}:{compound}", "black")

            try:
                # Call Dify API
                answer, _, full_response = chat.send_message(
                    query=compound,
                    user=f"{user_id}_batch{batch_num}",
                    response_mode="blocking"
                )

                # Parse results
                parsed_categories = parse_dify_response(answer)

                # Expand EndpointName into multiple rows (without AC50 matching)
                expanded_rows = expand_endpoint_rows(parsed_categories, compound)
                all_rows.extend(expanded_rows)

                # Save original record (for debugging)
                record_file = os.path.join(result_folder, f"Original_Record_{idx}.json")
                with open(record_file, "w", encoding="utf-8") as f:
                    json.dump({
                        "Input Compound": compound,
                        "Dify Original Response": answer,
                        "Complete Response": full_response,
                        "Parsed Classification": parsed_categories,
                        "Expanded Rows Count": len(expanded_rows),
                        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }, f, ensure_ascii=False, indent=4)

                log(f"✅ Processing completed: {compound} | Main Category: {parsed_categories['Main Category']} | Generated {len(expanded_rows)} rows",
                    "green")
                time.sleep(0.5)  # Avoid too fast requests

            except Exception as e:
                # One failed compound must not abort the batch: record a
                # placeholder row carrying the error text and continue.
                error_msg = str(e)
                log(f"❌ Processing failed: {compound} | Error: {error_msg}", "red")
                failed_list.append({
                    "CompoundName": normalize_compound_name(compound),
                    "OriginalCompoundName": compound,
                    #"CASRN": "",
                    "MainCategory": f"Processing Failed: {error_msg}",
                    "AdditionalCategory1": "",
                    "AdditionalCategory2": "",
                    "EndpointName": "",
                    "XLogP": "",
                    "BioPathway": "",
                    "ToxicityInfo": "",
                    "KnownUse": "",
                    "DisorderDisease": ""
                })

        # Merge results and save
        result_df = pd.DataFrame(all_rows)

        # Add failed records
        if failed_list:
            failed_df = pd.DataFrame(failed_list)
            result_df = pd.concat([result_df, failed_df], ignore_index=True)

        # Define column order
        column_order = [
            "CompoundName",
            "OriginalCompoundName",
            #"CASRN",
            "MainCategory",
            "AdditionalCategory1",
            "AdditionalCategory2",
            "EndpointName",
            "XLogP",
            "BioPathway",
            "ToxicityInfo",
            "KnownUse",
            "DisorderDisease"
        ]

        # Ensure all columns exist (empty batches may be missing some).
        for col in column_order:
            if col not in result_df.columns:
                result_df[col] = ""

        # Reorder columns
        result_df = result_df.reindex(columns=column_order)

        # Save final CSV (utf-8-sig so Excel recognizes the encoding).
        csv_filename = f"Compound_Query_Results_Batch{batch_num}.csv"
        csv_path_out = os.path.join(result_folder, csv_filename)
        result_df.to_csv(csv_path_out, index=False, encoding="utf-8-sig")
        log(f"📄 Result file saved to: {csv_path_out}", "blue")
        log(f"📊 Total Rows: {len(result_df)} rows", "blue")

        # Save failed list (separate file)
        if failed_list:
            fail_file = os.path.join(result_folder, f"Failed_List_Batch{batch_num}.csv")
            pd.DataFrame(failed_list).to_csv(fail_file, index=False, encoding="utf-8-sig")
            log(f"❌ {len(failed_list)} compounds failed to process, details: {fail_file}", "red")

        # Update progress and log after completion
        progress_var.set(100)
        log(f"\n{'=' * 40}", "blue")
        log(f"🏁 Processing Complete!", "green")
        log(f"{'=' * 40}", "blue")
        log(f"📊 Statistics: Total Compounds={total} | Successful Rows={len(all_rows)} | Failed Compounds={len(failed_list)}", "blue")
        log(f"📁 All results saved to: {result_folder}", "blue")

        # Ask if open result folder
        if messagebox.askyesno("Processing Complete", f"Batch processing completed!\nTotal {len(result_df)} rows of data generated\nOpen result folder?"):
            if os.name == 'nt':  # Windows
                os.startfile(result_folder)
            elif os.name == 'posix':  # macOS, Linux
                import subprocess
                try:
                    if sys.platform == 'darwin':
                        subprocess.run(['open', result_folder])
                    else:
                        subprocess.run(['xdg-open', result_folder])
                except Exception:
                    # BUG FIX: was a bare `except:`, which also swallows
                    # SystemExit/KeyboardInterrupt. Opening the folder is
                    # best-effort, so other failures are still ignored
                    # deliberately.
                    pass

    except Exception as e:
        log(f"❌ Overall processing failed: {str(e)}", "red")
        messagebox.showerror("Error", f"Processing failed: {str(e)}")
    finally:
        # Reset progress bar for the next run.
        progress_var.set(0)
415
-
416
-
417
- # ==================== Graphical User Interface Module ====================
418
class CompoundBatchToolGUI:
    """Tk window for configuring and launching a compound batch query run.

    Builds four stacked areas — file selection, parameter configuration,
    start/progress, and a scrolling color-coded log — and dispatches the
    actual work to batch_process_compounds_gui on a daemon thread.
    """

    def __init__(self, root):
        # root: the tk.Tk (or Toplevel) window this GUI is built into.
        self.root = root
        self.root.title("Compound Batch Query Tool v1.0")
        self.root.geometry("850x700")
        self.root.resizable(True, True)

        # Default configuration
        # NOTE(review): hardcoded API key and internal URL defaults shipped
        # in source — consider loading from env/config instead.
        self.default_api_key = "app-QRGuoLVqSksMsG4t9O53cITj"
        self.default_base_url = "http://192.168.0.179:8080/v1"
        self.default_save_root = "./Compound_Query_Results"
        self.default_compound_col = "IUPAC_name"
        self.default_csv_encoding = "utf-8"
        self.default_csv_sep = ","

        # Create main frame
        main_frame = ttk.Frame(root, padding="20")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # 1. File selection area: read-only path entry + browse button.
        file_frame = ttk.LabelFrame(main_frame, text="1. Select CSV File", padding="10")
        file_frame.pack(fill=tk.X, pady=5)

        self.csv_path_var = tk.StringVar()
        ttk.Entry(file_frame, textvariable=self.csv_path_var, state="readonly", width=65).grid(row=0, column=1, padx=5, pady=5)
        ttk.Button(file_frame, text="Select File", command=self.select_csv_file).grid(row=0, column=0, padx=5, pady=5)

        # 2. Parameter configuration area
        param_frame = ttk.LabelFrame(main_frame, text="2. Parameter Configuration", padding="10")
        param_frame.pack(fill=tk.X, pady=5)

        # 2.1 Dify configuration (API key + endpoint URL)
        ttk.Label(param_frame, text="Dify API Key:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=3)
        self.api_key_var = tk.StringVar(value=self.default_api_key)
        ttk.Entry(param_frame, textvariable=self.api_key_var, width=60).grid(row=0, column=1, columnspan=3, padx=5, pady=3)

        ttk.Label(param_frame, text="Dify URL:").grid(row=1, column=0, sticky=tk.W, padx=5, pady=3)
        self.base_url_var = tk.StringVar(value=self.default_base_url)
        ttk.Entry(param_frame, textvariable=self.base_url_var, width=60).grid(row=1, column=1, columnspan=3, padx=5, pady=3)

        # 2.2 CSV configuration (column name, encoding, separator)
        ttk.Label(param_frame, text="Compound Column Name:").grid(row=2, column=0, sticky=tk.W, padx=5, pady=3)
        self.compound_col_var = tk.StringVar(value=self.default_compound_col)
        ttk.Entry(param_frame, textvariable=self.compound_col_var, width=20).grid(row=2, column=1, padx=5, pady=3)

        ttk.Label(param_frame, text="CSV Encoding:").grid(row=2, column=2, sticky=tk.W, padx=5, pady=3)
        self.csv_encoding_var = tk.StringVar(value=self.default_csv_encoding)
        ttk.Entry(param_frame, textvariable=self.csv_encoding_var, width=15).grid(row=2, column=3, padx=5, pady=3)

        ttk.Label(param_frame, text="CSV Separator:").grid(row=3, column=0, sticky=tk.W, padx=5, pady=3)
        self.csv_sep_var = tk.StringVar(value=self.default_csv_sep)
        ttk.Entry(param_frame, textvariable=self.csv_sep_var, width=20).grid(row=3, column=1, padx=5, pady=3)

        # 2.3 Save configuration (AC50 folder removed)
        ttk.Label(param_frame, text="Result Save Path:").grid(row=4, column=0, sticky=tk.W, padx=5, pady=3)
        self.save_root_var = tk.StringVar(value=self.default_save_root)
        ttk.Entry(param_frame, textvariable=self.save_root_var, width=50).grid(row=4, column=1, columnspan=2, padx=5, pady=3)
        ttk.Button(param_frame, text="Select Path", command=self.select_save_root).grid(row=4, column=3, padx=5, pady=3)

        # 3. Operation area: progress bar + start button.
        op_frame = ttk.LabelFrame(main_frame, text="3. Start Processing", padding="10")
        op_frame.pack(fill=tk.X, pady=5)

        self.progress_var = tk.DoubleVar()
        progress_bar = ttk.Progressbar(op_frame, variable=self.progress_var, maximum=100)
        progress_bar.pack(fill=tk.X, padx=5, pady=5)

        self.start_btn = ttk.Button(op_frame, text="Start Batch Processing", command=self.start_processing)
        self.start_btn.pack(pady=5)

        # 4. Log output area (kept read-only; the worker toggles state to write)
        log_frame = ttk.LabelFrame(main_frame, text="4. Processing Log", padding="10")
        log_frame.pack(fill=tk.BOTH, expand=True, pady=5)

        self.log_text = scrolledtext.ScrolledText(log_frame, wrap=tk.WORD, state=tk.DISABLED)
        self.log_text.pack(fill=tk.BOTH, expand=True)
        # Set log color tags used by the worker's log() helper.
        self.log_text.tag_config("red", foreground="red")
        self.log_text.tag_config("green", foreground="green")
        self.log_text.tag_config("blue", foreground="blue")
        self.log_text.tag_config("orange", foreground="orange")
        self.log_text.tag_config("gray", foreground="gray")

        # 5. Bottom tip (AC50 related tip removed)
        tip_label = ttk.Label(main_frame,
                              text="Tip: Each endpoint returned by Dify generates a separate row in the result",
                              foreground="gray")
        tip_label.pack(side=tk.BOTTOM, pady=10)

    def select_csv_file(self):
        """Open a file dialog and store the chosen CSV path."""
        file_path = filedialog.askopenfilename(
            title="Select Compound CSV File",
            filetypes=[("CSV Files", "*.csv"), ("All Files", "*.*")]
        )
        if file_path:
            self.csv_path_var.set(file_path)

    def select_save_root(self):
        """Open a directory dialog and store the chosen save folder."""
        folder_path = filedialog.askdirectory(title="Select Result Save Folder")
        if folder_path:
            self.save_root_var.set(folder_path)

    def start_processing(self):
        """Validate inputs and launch batch processing on a daemon thread.

        Keeps the UI responsive by running batch_process_compounds_gui off
        the main thread; the start button is disabled for the duration.
        """
        # Verify required parameters
        csv_path = self.csv_path_var.get()
        if not csv_path:
            messagebox.showwarning("Warning", "Please select a CSV file first!")
            return

        api_key = self.api_key_var.get().strip()
        if not api_key:
            messagebox.showwarning("Warning", "Please fill in the Dify API Key!")
            return

        base_url = self.base_url_var.get().strip()
        if not base_url:
            messagebox.showwarning("Warning", "Please fill in the Dify URL!")
            return

        # Disable start button to prevent duplicate clicks
        self.start_btn.config(state=tk.DISABLED)

        # Clear log
        self.log_text.config(state=tk.NORMAL)
        self.log_text.delete(1.0, tk.END)
        self.log_text.config(state=tk.DISABLED)

        # New thread for processing (avoid UI freezing).
        # NOTE(review): the worker updates Tk widgets directly; Tkinter is
        # generally not thread-safe — confirm this is acceptable.
        def process_thread():
            try:
                batch_process_compounds_gui(
                    csv_path=csv_path,
                    save_root=self.save_root_var.get(),
                    api_key=api_key,
                    base_url=base_url,
                    log_text=self.log_text,
                    progress_var=self.progress_var,
                    compound_col=self.compound_col_var.get(),
                    csv_encoding=self.csv_encoding_var.get(),
                    csv_sep=self.csv_sep_var.get()
                )
            finally:
                # Restore button state even if processing raised.
                self.start_btn.config(state=tk.NORMAL)

        threading.Thread(target=process_thread, daemon=True).start()
571
-
572
-
573
# ==================== Start Program ====================
if __name__ == "__main__":
    # Normal GUI startup (AC50 debug mode removed): build the root window,
    # attach the tool GUI, and enter the Tk event loop.
    main_window = tk.Tk()
    app = CompoundBatchToolGUI(main_window)
    main_window.mainloop()