Spaces:

mgbam
/

CraAssitant

Runtime error

App Files Files Community

mgbam committed on Jan 20, 2025

Commit

31be05a

verified ·

1 Parent(s): 7e82038

Update app.py

Browse files

Files changed (1) hide show

app.py +193 -127

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import json
 import csv
 import asyncio
@@ -24,7 +25,6 @@ import altair as alt
 import spacy
 import spacy.cli
 import PyPDF2
-import io  # For handling in-memory files (Excel, etc.)
 # Ensure spaCy model is downloaded
 try:
@@ -53,7 +53,7 @@ PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
 PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
 EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
-# Hugging Face login
 login(HUGGINGFACE_TOKEN)
 # Initialize OpenAI
@@ -94,7 +94,10 @@ LANGUAGE_MAP: Dict[str, Tuple[str, str]] = {
     "French to English": ("fr", "en"),
 }
-### Utility Functions ###
 def safe_json_parse(text: str) -> Union[Dict, None]:
     """Safely parse JSON string into a Python dictionary."""
     try:
@@ -131,7 +134,10 @@ def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
         })
     return articles
-### Async Functions for Europe PMC ###
 async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
     params = {"query": nct_id, "format": "json"}
     async with httpx.AsyncClient() as client_http:
@@ -158,7 +164,6 @@ async def fetch_articles_by_query(query_params: str) -> Dict[str, Any]:
             logger.error(f"Error fetching articles: {e}")
             return {"error": str(e)}
-### PubMed Integration ###
 async def fetch_pubmed_by_query(query_params: str) -> Dict[str, Any]:
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
@@ -194,7 +199,6 @@ async def fetch_pubmed_by_query(query_params: str) -> Dict[str, Any]:
             logger.error(f"Error fetching PubMed articles: {e}")
             return {"error": str(e)}
-### Crossref Integration ###
 async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
@@ -209,7 +213,10 @@ async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
             logger.error(f"Error fetching Crossref data: {e}")
             return {"error": str(e)}
-### Core Functions ###
 def summarize_text(text: str) -> str:
     """Summarize text using OpenAI."""
     if not text.strip():
@@ -310,21 +317,19 @@ def perform_named_entity_recognition(text: str) -> str:
         logger.error(f"NER Error: {e}")
         return "Named Entity Recognition failed."
-### Enhanced EDA ###
 def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
     """
-    Perform a more advanced EDA given a DataFrame:
-      - Show dataset info (columns, shape, numeric summary).
-      - Generate a correlation heatmap (for numeric columns).
-      - Generate distribution plots (histograms) for numeric columns.
     Returns (text_summary, correlation_chart, distribution_chart).
     """
     try:
-        # Basic info
         columns_info = f"Columns: {list(df.columns)}"
         shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
-        # Describe with include="all" to show all columns
         with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
             describe_info = df.describe(include="all").to_string()
@@ -334,7 +339,6 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
             f"Summary Statistics:\n{describe_info}\n"
         )
-        # Correlation heatmap (if at least 2 numeric columns)
         numeric_cols = df.select_dtypes(include="number")
         corr_chart = None
         if numeric_cols.shape[1] >= 2:
@@ -353,7 +357,6 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
                 .properties(width=400, height=400, title="Correlation Heatmap")
             )
-        # Distribution plots (histograms) for numeric columns
         distribution_chart = None
         if numeric_cols.shape[1] >= 1:
             df_long = numeric_cols.melt(var_name='Column', value_name='Value')
@@ -380,86 +383,66 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
         logger.error(f"Enhanced EDA Error: {e}")
         return f"Enhanced EDA failed: {e}", None, None
-### File Handling ###
-def read_uploaded_file(uploaded_file: Optional[gr.File]) -> str:
-    """
-    Reads the content of an uploaded file (txt, csv, xls, xlsx, pdf).
-    Returns the extracted text or CSV-like content for non-Excel files.
-    For Excel, we return a placeholder string; we'll handle it later.
-    """
-    if uploaded_file is None:
-        return ""
-    file_name = uploaded_file.name
-    file_ext = os.path.splitext(file_name)[1].lower()
     try:
-        # TXT
-        if file_ext == ".txt":
-            return uploaded_file.read().decode("utf-8")
-        # CSV
-        elif file_ext == ".csv":
-            return uploaded_file.read().decode("utf-8")
-        # Excel
-        elif file_ext in [".xls", ".xlsx"]:
-            # Return a placeholder so we know an Excel file was uploaded
-            return "EXCEL_FILE_PLACEHOLDER"
-        # PDF
-        elif file_ext == ".pdf":
-            pdf_reader = PyPDF2.PdfReader(uploaded_file)
-            text_content = []
-            for page in pdf_reader.pages:
-                text_content.append(page.extract_text())
-            return "\n".join(text_content)
-        else:
-            return f"Unsupported file format: {file_ext}"
     except Exception as e:
-        logger.error(f"File read error: {e}")
-        return f"Error reading file: {e}"
 def parse_excel_file(uploaded_file: gr.File) -> pd.DataFrame:
     """
     Parse an Excel file into a pandas DataFrame.
-    1) Try using the local file path, if it exists.
-    2) Otherwise, read from the in-memory object using uploaded_file.file.read().
     """
     import pandas as pd
-    # If we have a valid local file path (common in some Gradio versions)
-    if os.path.exists(uploaded_file.name):
-        # Directly read from the file path
-        return pd.read_excel(uploaded_file.name, engine="openpyxl")
-    # Otherwise, we read the file from memory
     try:
         excel_bytes = uploaded_file.file.read()
         return pd.read_excel(io.BytesIO(excel_bytes), engine="openpyxl")
     except Exception as e:
-        logger.error(f"Excel parsing error: {e}")
-        raise ValueError(f"Excel parsing error: {e}")
-def parse_csv_content(csv_content: str) -> pd.DataFrame:
-    """
-    Attempt to parse CSV content with both utf-8 and utf-8-sig
-    to handle BOM issues or encoding complexities.
-    """
-    from io import StringIO
-    errors = []
-    for encoding_try in ["utf-8", "utf-8-sig"]:
-        try:
-            df = pd.read_csv(StringIO(csv_content), encoding=encoding_try)
-            return df
-        except Exception as e:
-            errors.append(f"Encoding {encoding_try} failed: {e}")
-    error_msg = "Could not parse CSV content.\n" + "\n".join(errors)
-    logger.error(error_msg)
-    raise ValueError(error_msg)
-### Gradio Interface ###
 with gr.Blocks() as demo:
     gr.Markdown("# ✨ Advanced Clinical Research Assistant with Enhanced EDA ✨")
     gr.Markdown("""
@@ -476,6 +459,7 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
     # Inputs
     with gr.Row():
         text_input = gr.Textbox(label="Input Text", lines=5, placeholder="Enter clinical text or query...")
         file_input = gr.File(
             label="Upload File (txt/csv/xls/xlsx/pdf)",
             file_types=[".txt", ".csv", ".xls", ".xlsx", ".pdf"]
@@ -515,16 +499,17 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
     # Outputs
     output_text = gr.Textbox(label="Output", lines=10)
     with gr.Row():
         output_chart = gr.Plot(label="Visualization 1")
         output_chart2 = gr.Plot(label="Visualization 2")
     output_file = gr.File(label="Generated File")
     submit_button = gr.Button("Submit")
-    # Async function for handling actions
     async def handle_action(
         action: str,
         text: str,
@@ -536,68 +521,85 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
         export_format: str
     ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
-        # 1) Read the uploaded file (if any) -> returns a string or placeholder
-        file_content = read_uploaded_file(file_up)
-        # 2) Combine user text with file text if needed
-        combined_text = (text + "\n" + file_content).strip() if file_content else text
-        ### Branch by action ###
         if action == "Summarize":
             return summarize_text(combined_text), None, None, None
         elif action == "Predict Outcome":
-            predictions = predict_outcome(combined_text)
-            if isinstance(predictions, dict):
-                chart = visualize_predictions(predictions)
-                return json.dumps(predictions, indent=2), chart, None, None
-            return predictions, None, None, None
         elif action == "Generate Report":
             file_path = generate_report(combined_text, filename=report_filename)
             msg = f"Report generated: {file_path}" if file_path else "Report generation failed."
             return msg, None, None, file_path
         elif action == "Translate":
-            return translate_text(combined_text, translation_opt), None, None, None
         elif action == "Perform Named Entity Recognition":
             ner_result = perform_named_entity_recognition(combined_text)
             return ner_result, None, None, None
         elif action == "Perform Enhanced EDA":
-            # Ensure some data is provided
-            if not file_up and not combined_text:
-                return "No data provided for EDA.", None, None, None
-            # If the user uploaded an Excel file
-            if file_up and file_up.name.lower().endswith((".xls", ".xlsx")):
-                try:
-                    df_excel = parse_excel_file(file_up)
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_excel)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"Excel EDA failed: {e}", None, None, None
-            # If the user uploaded a CSV
-            if file_up and file_up.name.lower().endswith(".csv"):
-                try:
-                    df_csv = parse_csv_content(file_content)
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_csv)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"CSV EDA failed: {e}", None, None, None
-            # If no file but possibly CSV text in the text box
-            if not file_up and "," in combined_text:
-                try:
-                    df_csv = parse_csv_content(combined_text)
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_csv)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"CSV EDA failed: {e}", None, None, None
-            return "No valid CSV/Excel data found for EDA.", None, None, None
         elif action == "Fetch Clinical Studies":
             if nct_id:
@@ -642,9 +644,69 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
             )
             return formatted, None, None, None
-        # Default fallback
         return "Invalid action.", None, None, None
     submit_button.click(
         handle_action,
         inputs=[
@@ -657,8 +719,12 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
             report_filename_input,
             export_format,
         ],
-        outputs=[output_text, output_chart, output_chart2, output_file],
     )
-# Launch the Gradio app
 demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

 import os
+import io
 import json
 import csv
 import asyncio
 import spacy
 import spacy.cli
 import PyPDF2
 # Ensure spaCy model is downloaded
 try:
 PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
 EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
+# Log in to Hugging Face
 login(HUGGINGFACE_TOKEN)
 # Initialize OpenAI
     "French to English": ("fr", "en"),
 }
+###################################################
+#                     UTILS                       #
+###################################################
 def safe_json_parse(text: str) -> Union[Dict, None]:
     """Safely parse JSON string into a Python dictionary."""
     try:
         })
     return articles
+###################################################
+#                ASYNC FETCHES                    #
+###################################################
 async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
     params = {"query": nct_id, "format": "json"}
     async with httpx.AsyncClient() as client_http:
             logger.error(f"Error fetching articles: {e}")
             return {"error": str(e)}
 async def fetch_pubmed_by_query(query_params: str) -> Dict[str, Any]:
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
             logger.error(f"Error fetching PubMed articles: {e}")
             return {"error": str(e)}
 async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
             logger.error(f"Error fetching Crossref data: {e}")
             return {"error": str(e)}
+###################################################
+#                   CORE LOGIC                    #
+###################################################
 def summarize_text(text: str) -> str:
     """Summarize text using OpenAI."""
     if not text.strip():
         logger.error(f"NER Error: {e}")
         return "Named Entity Recognition failed."
+###################################################
+#                ENHANCED EDA                     #
+###################################################
 def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
     """
+    Show columns, shape, numeric summary, correlation heatmap, and distribution histograms.
     Returns (text_summary, correlation_chart, distribution_chart).
     """
     try:
         columns_info = f"Columns: {list(df.columns)}"
         shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
         with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
             describe_info = df.describe(include="all").to_string()
             f"Summary Statistics:\n{describe_info}\n"
         )
         numeric_cols = df.select_dtypes(include="number")
         corr_chart = None
         if numeric_cols.shape[1] >= 2:
                 .properties(width=400, height=400, title="Correlation Heatmap")
             )
         distribution_chart = None
         if numeric_cols.shape[1] >= 1:
             df_long = numeric_cols.melt(var_name='Column', value_name='Value')
         logger.error(f"Enhanced EDA Error: {e}")
         return f"Enhanced EDA failed: {e}", None, None
+###################################################
+#                FILE PARSING                     #
+###################################################
def _read_upload_bytes(uploaded_file: Any) -> bytes:
    """Return the raw bytes of an uploaded file.

    The copy on disk is preferred: ``uploaded_file.name`` (or the value itself
    when it is a plain path string) is opened if it points at an existing
    file. Otherwise the bytes are read from ``uploaded_file.file`` or, when
    that attribute is absent, from ``uploaded_file`` itself.

    Raises:
        AttributeError: If no existing path is found and the object exposes
            no ``read()`` method.
    """
    path = getattr(uploaded_file, "name", uploaded_file)
    if isinstance(path, (str, os.PathLike)) and os.path.isfile(path):
        with open(path, "rb") as handle:
            return handle.read()

    stream = getattr(uploaded_file, "file", uploaded_file)
    try:
        # Rewind so a handle that was already consumed is read from the start.
        stream.seek(0)
    except (AttributeError, OSError, ValueError):
        pass  # Not seekable: fall back to reading from the current position.
    payload = stream.read()
    return payload.encode("utf-8") if isinstance(payload, str) else payload


def parse_text_file(uploaded_file: Any) -> str:
    """Read a .txt upload as UTF-8 text, dropping a leading BOM if present.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    return _read_upload_bytes(uploaded_file).decode("utf-8-sig")


def parse_csv_file(uploaded_file: Any) -> pd.DataFrame:
    """Parse a CSV upload into a DataFrame.

    The bytes are decoded as ``utf-8-sig`` so a byte-order mark does not end
    up inside the first column name; undecodable bytes are replaced rather
    than aborting the parse.

    Raises:
        ValueError: If pandas cannot parse the decoded content.
    """
    content = _read_upload_bytes(uploaded_file).decode("utf-8-sig", errors="replace")
    try:
        return pd.read_csv(io.StringIO(content))
    except Exception as e:
        raise ValueError(f"CSV parse error: {e}") from e


def parse_excel_file(uploaded_file: Any) -> pd.DataFrame:
    """Parse an Excel upload (.xls or .xlsx) into a DataFrame.

    ``.xlsx`` files are read with openpyxl, as before. openpyxl cannot open
    legacy ``.xls`` workbooks, so for that extension the engine choice is left
    to pandas.

    Raises:
        ValueError: If the file cannot be read or parsed.
    """
    file_name = str(getattr(uploaded_file, "name", uploaded_file))
    suffix = os.path.splitext(file_name)[1].lower()
    engine = None if suffix == ".xls" else "openpyxl"
    try:
        workbook = io.BytesIO(_read_upload_bytes(uploaded_file))
        return pd.read_excel(workbook, engine=engine)
    except Exception as e:
        raise ValueError(f"Excel parse error: {e}") from e


def parse_pdf_file(uploaded_file: Any) -> str:
    """Extract the text of every page of a PDF upload, joined by newlines.

    Failures are logged and reported in the returned string instead of being
    raised, so callers always get text back.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(_read_upload_bytes(uploaded_file)))
        # Guard against a page yielding no text so the join cannot fail.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        logger.error(f"PDF parse error: {e}")
        return f"Error reading PDF file: {e}"
+###################################################
+#             GRADIO INTERFACE                    #
+###################################################
 with gr.Blocks() as demo:
     gr.Markdown("# ✨ Advanced Clinical Research Assistant with Enhanced EDA ✨")
     gr.Markdown("""
     # Inputs
     with gr.Row():
         text_input = gr.Textbox(label="Input Text", lines=5, placeholder="Enter clinical text or query...")
+        # We'll rely on .name and .file for the path and file handle
         file_input = gr.File(
             label="Upload File (txt/csv/xls/xlsx/pdf)",
             file_types=[".txt", ".csv", ".xls", ".xlsx", ".pdf"]
     # Outputs
     output_text = gr.Textbox(label="Output", lines=10)
     with gr.Row():
         output_chart = gr.Plot(label="Visualization 1")
         output_chart2 = gr.Plot(label="Visualization 2")
     output_file = gr.File(label="Generated File")
     submit_button = gr.Button("Submit")
+    ################################################################
+    #                    MAIN HANDLER FUNCTION                     #
+    ################################################################
     async def handle_action(
         action: str,
         text: str,
         export_format: str
     ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
+        # 1) Start with user-provided text
+        combined_text = text.strip()
+        # 2) If user uploaded a file, parse it based on extension
+        if file_up is not None:
+            file_ext = os.path.splitext(file_up.name)[1].lower()
+            if file_ext == ".txt":
+                file_text = parse_text_file(file_up)
+                combined_text = (combined_text + "\n" + file_text).strip()
+            elif file_ext == ".csv":
+                # If user chose EDA, we'll parse into DataFrame below
+                # If we just want to combine text for Summarize, etc., do so:
+                pass
+            elif file_ext in [".xls", ".xlsx"]:
+                # We'll handle Excel parsing in the EDA step if needed
+                pass
+            elif file_ext == ".pdf":
+                file_text = parse_pdf_file(file_up)
+                combined_text = (combined_text + "\n" + file_text).strip()
+        ### ACTIONS ###
         if action == "Summarize":
+            if file_up and file_up.name.endswith(".csv"):
+                # Merge CSV text into combined_text
+                # in case user wants summarization of the CSV's raw text
+                try:
+                    df_csv = parse_csv_file(file_up)
+                    # Turn CSV into text
+                    csv_as_text = df_csv.to_csv(index=False)
+                    combined_text = (combined_text + "\n" + csv_as_text).strip()
+                except Exception as e:
+                    return f"CSV parse error for Summarize: {e}", None, None, None
+            # Summarize the combined text
             return summarize_text(combined_text), None, None, None
         elif action == "Predict Outcome":
+            return _action_predict_outcome(combined_text, file_up)
         elif action == "Generate Report":
+            # Add CSV content if needed
+            if file_up and file_up.name.endswith(".csv"):
+                try:
+                    df_csv = parse_csv_file(file_up)
+                    combined_text += "\n" + df_csv.to_csv(index=False)
+                except Exception as e:
+                    logger.error(f"Error reading CSV for report: {e}")
             file_path = generate_report(combined_text, filename=report_filename)
             msg = f"Report generated: {file_path}" if file_path else "Report generation failed."
             return msg, None, None, file_path
         elif action == "Translate":
+            # Optionally read CSV or PDF text?
+            if file_up and file_up.name.endswith(".csv"):
+                try:
+                    df_csv = parse_csv_file(file_up)
+                    combined_text += "\n" + df_csv.to_csv(index=False)
+                except Exception as e:
+                    return f"CSV parse error for Translate: {e}", None, None, None
+            translated = translate_text(combined_text, translation_opt)
+            return translated, None, None, None
         elif action == "Perform Named Entity Recognition":
+            # Merge CSV as text if user wants NER on CSV
+            if file_up and file_up.name.endswith(".csv"):
+                try:
+                    df_csv = parse_csv_file(file_up)
+                    combined_text += "\n" + df_csv.to_csv(index=False)
+                except Exception as e:
+                    return f"CSV parse error for NER: {e}", None, None, None
             ner_result = perform_named_entity_recognition(combined_text)
             return ner_result, None, None, None
         elif action == "Perform Enhanced EDA":
+            return await _action_eda(combined_text, file_up, text)
         elif action == "Fetch Clinical Studies":
             if nct_id:
             )
             return formatted, None, None, None
         return "Invalid action.", None, None, None
+    def _action_predict_outcome(combined_text: str, file_up: gr.File) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
+        # If CSV is uploaded, we can merge it into text or do separate logic
+        if file_up and file_up.name.endswith(".csv"):
+            try:
+                df_csv = parse_csv_file(file_up)
+                # Optionally, merge CSV content into the text to be classified
+                combined_text_local = combined_text + "\n" + df_csv.to_csv(index=False)
+            except Exception as e:
+                return f"CSV parse error for Predict Outcome: {e}", None, None, None
+        else:
+            combined_text_local = combined_text
+        predictions = predict_outcome(combined_text_local)
+        if isinstance(predictions, dict):
+            chart = visualize_predictions(predictions)
+            return json.dumps(predictions, indent=2), chart, None, None
+        return predictions, None, None, None
+    async def _action_eda(combined_text: str, file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
+        """
+        Perform Enhanced EDA on a CSV or Excel file if uploaded.
+        If .csv is present, parse as CSV; if .xls/.xlsx is present, parse as Excel.
+        """
+        # Make sure we either have a file or some data in the text
+        if not file_up and not raw_text.strip():
+            return "No data provided for EDA.", None, None, None
+        if file_up:
+            file_ext = os.path.splitext(file_up.name)[1].lower()
+            if file_ext == ".csv":
+                try:
+                    df_csv = parse_csv_file(file_up)
+                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_csv)
+                    return eda_summary, corr_chart, dist_chart, None
+                except Exception as e:
+                    return f"CSV EDA failed: {e}", None, None, None
+            elif file_ext in [".xls", ".xlsx"]:
+                try:
+                    df_excel = parse_excel_file(file_up)
+                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_excel)
+                    return eda_summary, corr_chart, dist_chart, None
+                except Exception as e:
+                    return f"Excel EDA failed: {e}", None, None, None
+            else:
+                # EDA not supported for PDF or .txt in this example
+                return "No valid CSV/Excel data found for EDA.", None, None, None
+        else:
+            # If no file, maybe the user pasted CSV into the text box
+            if "," in raw_text:
+                # Attempt to parse text as CSV
+                try:
+                    from io import StringIO
+                    df_csv = pd.read_csv(StringIO(raw_text))
+                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_csv)
+                    return eda_summary, corr_chart, dist_chart, None
+                except Exception as e:
+                    return f"EDA parse error for pasted CSV: {e}", None, None, None
+            return "No valid CSV/Excel data found for EDA.", None, None, None
     submit_button.click(
         handle_action,
         inputs=[
             report_filename_input,
             export_format,
         ],
+        outputs=[
+            output_text,
+            output_chart,
+            output_chart2,
+            output_file,
+        ],
     )
 demo.launch(server_name="0.0.0.0", server_port=7860, share=True)