Spaces: Runtime error

Update app.py

app.py CHANGED
@@ -43,27 +43,18 @@ HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ENTREZ_EMAIL = os.getenv("ENTREZ_EMAIL")

-# Basic checks
if not HUGGINGFACE_TOKEN or not OPENAI_API_KEY:
    logger.error("Missing Hugging Face or OpenAI credentials.")
    raise ValueError("Missing credentials for Hugging Face or OpenAI.")

-#
-PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
-PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
-EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
-
-# Log in to Hugging Face
+# Hugging Face & OpenAI
login(HUGGINGFACE_TOKEN)
-
-# Initialize OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

-# Device setting
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

-# Model
+# Model: Classification
MODEL_NAME = "mgbam/bert-base-finetuned-mgbam"
try:
    model = AutoModelForSequenceClassification.from_pretrained(
@@ -76,7 +67,7 @@ except Exception as e:
    logger.error(f"Model load error: {e}")
    raise

-# Translation
+# Model: Translation
try:
    translation_model_name = "Helsinki-NLP/opus-mt-en-fr"
    translation_model = MarianMTModel.from_pretrained(
@@ -94,12 +85,16 @@ LANGUAGE_MAP: Dict[str, Tuple[str, str]] = {
    "French to English": ("fr", "en"),
}

-
-
-
+# API endpoints
+PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
+
+##########################################################
+# HELPER FUNCTIONS #
+##########################################################

def safe_json_parse(text: str) -> Union[Dict, None]:
-    """Safely parse JSON string into a Python dictionary."""
    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
@@ -107,7 +102,7 @@ def safe_json_parse(text: str) -> Union[Dict, None]:
        return None

def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
-    """
+    """Parse PubMed XML and return structured articles."""
    root = ET.fromstring(xml_data)
    articles = []
    for article in root.findall(".//PubmedArticle"):
@@ -134,9 +129,9 @@ def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
        })
    return articles

-
-#
-
+##########################################################
+# ASYNC FETCH FUNCTIONS #
+##########################################################

async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
    params = {"query": nct_id, "format": "json"}
@@ -213,12 +208,11 @@ async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
        logger.error(f"Error fetching Crossref data: {e}")
        return {"error": str(e)}

-
-#
-
+##########################################################
+# CORE FUNCTIONS #
+##########################################################

def summarize_text(text: str) -> str:
-    """Summarize text using OpenAI."""
    if not text.strip():
        return "No text provided for summarization."
    try:
@@ -234,7 +228,6 @@ def summarize_text(text: str) -> str:
        return "Summarization failed."

def predict_outcome(text: str) -> Union[Dict[str, float], str]:
-    """Predict outcomes (classification) using a fine-tuned model."""
    if not text.strip():
        return "No text provided for prediction."
    try:
@@ -249,7 +242,6 @@ def predict_outcome(text: str) -> Union[Dict[str, float], str]:
        return "Prediction failed."

def generate_report(text: str, filename: str = "clinical_report.pdf") -> Optional[str]:
-    """Generate a PDF report from the given text."""
    try:
        if not text.strip():
            logger.warning("No text provided for the report.")
@@ -271,7 +263,6 @@ def generate_report(text: str, filename: str = "clinical_report.pdf") -> Optional[str]:
        return None

def visualize_predictions(predictions: Dict[str, float]) -> Optional[alt.Chart]:
-    """Visualize model prediction probabilities using Altair."""
    try:
        data = pd.DataFrame(list(predictions.items()), columns=["Label", "Probability"])
        chart = (
@@ -290,7 +281,6 @@ def visualize_predictions(predictions: Dict[str, float]) -> Optional[alt.Chart]:
        return None

def translate_text(text: str, translation_option: str) -> str:
-    """Translate text between English and French."""
    if not text.strip():
        return "No text provided for translation."
    try:
@@ -304,7 +294,6 @@ def translate_text(text: str, translation_option: str) -> str:
        return "Translation failed."

def perform_named_entity_recognition(text: str) -> str:
-    """Perform Named Entity Recognition (NER) using spaCy."""
    if not text.strip():
        return "No text provided for NER."
    try:
@@ -317,19 +306,15 @@ def perform_named_entity_recognition(text: str) -> str:
        logger.error(f"NER Error: {e}")
        return "Named Entity Recognition failed."

-
-#
-
+##########################################################
+# ENHANCED EDA FUNCTIONS #
+##########################################################

def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
-    """
-    Show columns, shape, numeric summary, correlation heatmap, and distribution histograms.
-    Returns (text_summary, correlation_chart, distribution_chart).
-    """
+    """Show columns, shape, numeric summary, correlation heatmap, distribution histograms."""
    try:
        columns_info = f"Columns: {list(df.columns)}"
        shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
-
        with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
            describe_info = df.describe(include="all").to_string()

@@ -340,7 +325,9 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
        )

        numeric_cols = df.select_dtypes(include="number")
-        corr_chart = None
+        corr_chart, distribution_chart = None, None
+
+        # Correlation
        if numeric_cols.shape[1] >= 2:
            corr = numeric_cols.corr()
            corr_melted = corr.reset_index().melt(id_vars="index")
@@ -357,7 +344,7 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
                .properties(width=400, height=400, title="Correlation Heatmap")
            )

-
+        # Distribution
        if numeric_cols.shape[1] >= 1:
            df_long = numeric_cols.melt(var_name='Column', value_name='Value')
            distribution_chart = (
@@ -383,83 +370,108 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
        logger.error(f"Enhanced EDA Error: {e}")
        return f"Enhanced EDA failed: {e}", None, None

-
-#
-
+##########################################################
+# PARSING FILES WITHOUT .read() ERRORS #
+##########################################################

-def
-    """
-
+def parse_text_file_as_str(file_up: gr.File) -> str:
+    """
+    For .txt or .pdf, read them manually.
+    (We'll do PDF in a separate function.)
+    """
+    # If user has older Gradio that doesn't store .file or .read()
+    # let's do the same approach as CSV:
+    return _read_file_contents(file_up)

-def
+def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
    """
-
-
+    Safely parse a CSV with fallback approach:
+    1) If file path exists, read from disk.
+    2) Else read from uploaded_file.file in memory.
+    Then parse with pandas.
    """
-
-    #
-
-
-
-        df = pd.read_csv(StringIO(content))
-        return df
-    except Exception as e:
-        raise ValueError(f"CSV parse error: {e}")
+    raw_text = _read_file_contents(file_up)
+    # Parse with pandas
+    from io import StringIO
+    df = pd.read_csv(StringIO(raw_text))
+    return df

-def
+def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
    """
-
-
-
+    For .xls or .xlsx:
+    1) If file path exists, read from that path.
+    2) Else read from .file in memory.
    """
-    import pandas as pd
    import os
-
-    excel_path = uploaded_file.name
-    # Try local path first
+    excel_path = file_up.name
    if os.path.isfile(excel_path):
        return pd.read_excel(excel_path, engine="openpyxl")
+    else:
+        try:
+            raw_bytes = file_up.file.read()  # fallback approach
+            return pd.read_excel(io.BytesIO(raw_bytes), engine="openpyxl")
+        except Exception as e:
+            raise ValueError(f"Excel parse error: {e}")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def parse_pdf_file_as_str(file_up: gr.File) -> str:
+    """
+    For PDFs, read pages with PyPDF2.
+    """
+    import os
+    pdf_path = file_up.name
+    # If the path is real
+    if os.path.isfile(pdf_path):
+        with open(pdf_path, "rb") as f:
+            pdf_reader = PyPDF2.PdfReader(f)
+            text_content = []
+            for page in pdf_reader.pages:
+                text_content.append(page.extract_text() or "")
+            return "\n".join(text_content)
+    else:
+        # Fallback read from memory
+        try:
+            pdf_bytes = file_up.file.read()
+            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
+            text_content = []
+            for page in reader.pages:
+                text_content.append(page.extract_text() or "")
+            return "\n".join(text_content)
+        except Exception as e:
+            raise ValueError(f"PDF parse error: {e}")

-
-
-
+def _read_file_contents(file_up: gr.File, encoding="utf-8") -> str:
+    """
+    Generic fallback approach for .txt or .csv:
+    1) If file path is real, read from disk.
+    2) Else read from file_up.file in memory.
+    """
+    import os
+    path = file_up.name
+    if os.path.isfile(path):
+        with open(path, "rb") as f:
+            return f.read().decode(encoding, errors="replace")
+    else:
+        # fallback
+        return file_up.file.read().decode(encoding, errors="replace")
+
+##########################################################
+# GRADIO APP SETUP #
+##########################################################

with gr.Blocks() as demo:
-    gr.Markdown("#
+    gr.Markdown("# 🩺 Enhanced Clinical Research Assistant with EDA")
    gr.Markdown("""
-
-    - **
-    - **
-    - **
-    - **
-    - **
-    - **
-    - **Perform Enhanced EDA** on CSV/Excel data (correlation heatmaps + distribution plots).
+    - **Summarize** text (GPT-3.5)
+    - **Predict** outcomes (fine-tuned model)
+    - **Translate** (English ↔ French)
+    - **Named Entity Recognition** (spaCy)
+    - **Fetch** from PubMed, Crossref, Europe PMC
+    - **Generate** PDF reports
+    - **Enhanced EDA** on CSV/Excel (correlation, distributions)
    """)
-
-    # Inputs
+
    with gr.Row():
-        text_input = gr.Textbox(label="Input Text", lines=5
-        # We'll rely on .name and .file for the path and file handle
+        text_input = gr.Textbox(label="Input Text", lines=5)
        file_input = gr.File(
            label="Upload File (txt/csv/xls/xlsx/pdf)",
            file_types=[".txt", ".csv", ".xls", ".xlsx", ".pdf"]
@@ -485,127 +497,167 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
            label="Translation Option",
            value="English to French"
        )
-        query_params_input = gr.Textbox(
-
-
-        )
-        nct_id_input = gr.Textbox(label="NCT ID for Article Search")
-        report_filename_input = gr.Textbox(
-            label="Report Filename",
-            placeholder="clinical_report.pdf",
-            value="clinical_report.pdf"
-        )
-        export_format = gr.Dropdown(["None", "CSV", "JSON"], label="Export Format")
+        query_params_input = gr.Textbox(label="Query Params (JSON)", placeholder='{"term": "cancer"}')
+        nct_id_input = gr.Textbox(label="NCT ID")
+        report_filename_input = gr.Textbox(label="Report Filename", value="clinical_report.pdf")
+        export_format = gr.Dropdown(choices=["None", "CSV", "JSON"], label="Export Format")

-
-    output_text = gr.Textbox(label="Output", lines=10)
+    output_text = gr.Textbox(label="Output", lines=8)
    with gr.Row():
-        output_chart = gr.Plot(label="
-        output_chart2 = gr.Plot(label="
+        output_chart = gr.Plot(label="Chart 1")
+        output_chart2 = gr.Plot(label="Chart 2")
        output_file = gr.File(label="Generated File")

-
-
+    submit_btn = gr.Button("Submit")
+
    ################################################################
-    # MAIN HANDLER
+    # MAIN ACTION HANDLER #
    ################################################################
-
    async def handle_action(
        action: str,
-
+        txt: str,
        file_up: gr.File,
        translation_opt: str,
-
+        query_str: str,
        nct_id: str,
-
-
+        report_fn: str,
+        exp_fmt: str
    ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:

-        #
-        combined_text =
+        # Start with user text
+        combined_text = txt.strip()

-        # 2) If user uploaded a file, parse it based on extension
        if file_up is not None:
            file_ext = os.path.splitext(file_up.name)[1].lower()

-
-
-
+            # For Summaries, NER, etc. we'll just append the file text to 'combined_text'
+            # For EDA, we'll parse into a DataFrame
+            # Let's do minimal logic here, then handle in each action block.

-
-
-
-            pass
-
-            elif file_ext in [".xls", ".xlsx"]:
-                # We'll handle Excel parsing in the EDA step if needed
-                pass
+            if file_ext == ".txt":
+                file_text = _read_file_contents(file_up)
+                combined_text += "\n" + file_text

            elif file_ext == ".pdf":
-                file_text = parse_pdf_file(file_up)
-                combined_text = (combined_text + "\n" + file_text).strip()
-
-        ### ACTIONS ###
-        if action == "Summarize":
-            if file_up and file_up.name.endswith(".csv"):
-                # Merge CSV text into combined_text
-                # in case user wants summarization of the CSV's raw text
                try:
-
-
-                    csv_as_text = df_csv.to_csv(index=False)
-                    combined_text = (combined_text + "\n" + csv_as_text).strip()
+                    pdf_text = parse_pdf_file_as_str(file_up)
+                    combined_text += "\n" + pdf_text
                except Exception as e:
-                    return f"
+                    return f"PDF parse error: {e}", None, None, None
+
+        # Now handle each action:
+        if action == "Summarize":
+            # If user uploaded CSV or Excel, optionally parse it into text
+            if file_up:
+                fx = file_up.name.lower()
+                if fx.endswith(".csv"):
+                    try:
+                        df_csv = parse_csv_file_to_df(file_up)
+                        csv_as_text = df_csv.to_csv(index=False)
+                        combined_text += "\n" + csv_as_text
+                    except Exception as e:
+                        return f"CSV parse error for Summarize: {e}", None, None, None
+                elif fx.endswith((".xls", ".xlsx")):
+                    try:
+                        df_xl = parse_excel_file_to_df(file_up)
+                        excel_as_text = df_xl.to_csv(index=False)
+                        combined_text += "\n" + excel_as_text
+                    except Exception as e:
+                        return f"Excel parse error for Summarize: {e}", None, None, None

-
-            return
+            summary = summarize_text(combined_text)
+            return summary, None, None, None

        elif action == "Predict Outcome":
-
+            # Optionally parse CSV/Excel into text
+            if file_up:
+                fx = file_up.name.lower()
+                if fx.endswith(".csv"):
+                    try:
+                        df_csv = parse_csv_file_to_df(file_up)
+                        combined_text += "\n" + df_csv.to_csv(index=False)
+                    except Exception as e:
+                        return f"CSV parse error: {e}", None, None, None
+                elif fx.endswith((".xls", ".xlsx")):
+                    try:
+                        df_xl = parse_excel_file_to_df(file_up)
+                        combined_text += "\n" + df_xl.to_csv(index=False)
+                    except Exception as e:
+                        return f"Excel parse error: {e}", None, None, None
+
+            predictions = predict_outcome(combined_text)
+            if isinstance(predictions, dict):
+                chart = visualize_predictions(predictions)
+                return json.dumps(predictions, indent=2), chart, None, None
+            return predictions, None, None, None

        elif action == "Generate Report":
-            #
-            if file_up
-
-
-
-
-
-
-
-
+            # Merge CSV/Excel if needed
+            if file_up:
+                fx = file_up.name.lower()
+                if fx.endswith(".csv"):
+                    try:
+                        df_csv = parse_csv_file_to_df(file_up)
+                        combined_text += "\n" + df_csv.to_csv(index=False)
+                    except Exception as e:
+                        return f"CSV parse error for Report: {e}", None, None, None
+                elif fx.endswith((".xls", ".xlsx")):
+                    try:
+                        df_xl = parse_excel_file_to_df(file_up)
+                        combined_text += "\n" + df_xl.to_csv(index=False)
+                    except Exception as e:
+                        return f"Excel parse error for Report: {e}", None, None, None
+
+            fp = generate_report(combined_text, report_fn)
+            msg = f"Report generated: {fp}" if fp else "Report generation failed."
+            return msg, None, None, fp

        elif action == "Translate":
-
-
-
-
-
-
-
+            if file_up:
+                fx = file_up.name.lower()
+                if fx.endswith(".csv"):
+                    try:
+                        df_csv = parse_csv_file_to_df(file_up)
+                        combined_text += "\n" + df_csv.to_csv(index=False)
+                    except Exception as e:
+                        return f"CSV parse error for Translate: {e}", None, None, None
+                elif fx.endswith((".xls", ".xlsx")):
+                    try:
+                        df_xl = parse_excel_file_to_df(file_up)
+                        combined_text += "\n" + df_xl.to_csv(index=False)
+                    except Exception as e:
+                        return f"Excel parse error for Translate: {e}", None, None, None
+
            translated = translate_text(combined_text, translation_opt)
            return translated, None, None, None

        elif action == "Perform Named Entity Recognition":
-
-
-
-
-
-
-
+            if file_up:
+                fx = file_up.name.lower()
+                if fx.endswith(".csv"):
+                    try:
+                        df_csv = parse_csv_file_to_df(file_up)
+                        combined_text += "\n" + df_csv.to_csv(index=False)
+                    except Exception as e:
+                        return f"CSV parse error for NER: {e}", None, None, None
+                elif fx.endswith((".xls", ".xlsx")):
+                    try:
+                        df_xl = parse_excel_file_to_df(file_up)
+                        combined_text += "\n" + df_xl.to_csv(index=False)
+                    except Exception as e:
+                        return f"Excel parse error for NER: {e}", None, None, None
+
            ner_result = perform_named_entity_recognition(combined_text)
            return ner_result, None, None, None

        elif action == "Perform Enhanced EDA":
-            return await _action_eda(
+            return await _action_eda(file_up, txt)

        elif action == "Fetch Clinical Studies":
            if nct_id:
                result = await fetch_articles_by_nct_id(nct_id)
-            elif
-                result = await fetch_articles_by_query(
+            elif query_str:
+                result = await fetch_articles_by_query(query_str)
            else:
                return "Provide either an NCT ID or valid query parameters.", None, None, None

@@ -620,7 +672,7 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
            return formatted_results, None, None, None

        elif action in ["Fetch PubMed Articles (Legacy)", "Fetch PubMed by Query"]:
-            pubmed_result = await fetch_pubmed_by_query(
+            pubmed_result = await fetch_pubmed_by_query(query_str)
            xml_data = pubmed_result.get("result")
            if xml_data:
                articles = parse_pubmed_xml(xml_data)
@@ -634,7 +686,7 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
            return "No articles found or error fetching data.", None, None, None

        elif action == "Fetch Crossref by Query":
-            crossref_result = await fetch_crossref_by_query(
+            crossref_result = await fetch_crossref_by_query(query_str)
            items = crossref_result.get("message", {}).get("items", [])
            if not items:
                return "No results found.", None, None, None
@@ -645,86 +697,49 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
            return formatted, None, None, None

        return "Invalid action.", None, None, None
-
-    def _action_predict_outcome(combined_text: str, file_up: gr.File) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
-        # If CSV is uploaded, we can merge it into text or do separate logic
-        if file_up and file_up.name.endswith(".csv"):
-            try:
-                df_csv = parse_csv_file(file_up)
-                # Optionally, merge CSV content into the text to be classified
-                combined_text_local = combined_text + "\n" + df_csv.to_csv(index=False)
-            except Exception as e:
-                return f"CSV parse error for Predict Outcome: {e}", None, None, None
-        else:
-            combined_text_local = combined_text
-
-        predictions = predict_outcome(combined_text_local)
-        if isinstance(predictions, dict):
-            chart = visualize_predictions(predictions)
-            return json.dumps(predictions, indent=2), chart, None, None
-        return predictions, None, None, None

-    async def _action_eda(
+    async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
        """
-        Perform Enhanced EDA on
-        If .csv is present, parse as CSV; if .xls/.xlsx is present, parse as Excel.
+        Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV.
        """
-
-        if not file_up and not raw_text.strip():
+        if file_up is None and not raw_text.strip():
            return "No data provided for EDA.", None, None, None

-
-
-
+        # If a file is present
+        if file_up is not None:
+            ext = os.path.splitext(file_up.name)[1].lower()
+            if ext == ".csv":
                try:
-
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(
+                    df = parse_csv_file_to_df(file_up)
+                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
                    return eda_summary, corr_chart, dist_chart, None
                except Exception as e:
                    return f"CSV EDA failed: {e}", None, None, None
-
-            elif file_ext in [".xls", ".xlsx"]:
+            elif ext in [".xls", ".xlsx"]:
                try:
-
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(
+                    df = parse_excel_file_to_df(file_up)
+                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
                    return eda_summary, corr_chart, dist_chart, None
                except Exception as e:
                    return f"Excel EDA failed: {e}", None, None, None
-
            else:
-
-                return "No valid CSV/Excel data found for EDA.", None, None, None
+                return "No valid CSV/Excel data for EDA.", None, None, None
        else:
-            # If no file, maybe
+            # If no file, maybe user pasted CSV text
            if "," in raw_text:
-
+                from io import StringIO
                try:
-
-
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_csv)
+                    df = pd.read_csv(StringIO(raw_text))
+                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
                    return eda_summary, corr_chart, dist_chart, None
                except Exception as e:
-                    return f"
+                    return f"Text-based CSV parse error: {e}", None, None, None
            return "No valid CSV/Excel data found for EDA.", None, None, None

-
-        handle_action,
-        inputs=[
-
-            text_input,
-            file_input,
-            translation_option,
-            query_params_input,
-            nct_id_input,
-            report_filename_input,
-            export_format,
-        ],
-        outputs=[
-            output_text,
-            output_chart,
-            output_chart2,
-            output_file,
-        ],
+    submit_btn.click(
+        fn=handle_action,
+        inputs=[action, text_input, file_input, translation_option, query_params_input, nct_id_input, report_filename_input, export_format],
+        outputs=[output_text, output_chart, output_chart2, output_file],
    )

demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
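The new parsing helpers in this commit all share one pattern: prefer the temp-file path that Gradio reports in `file_up.name`, and only fall back to reading the in-memory `file_up.file` handle when that path does not exist on disk (the ".read() errors" the section banner refers to). Below is a minimal standalone sketch of that pattern, not code from the Space itself: the `read_upload` helper and the `Upload` protocol are hypothetical, and the assumption is only that the uploaded object exposes a `.name` path and a binary `.file` handle as Gradio's upload wrapper does.

import os
from typing import BinaryIO, Protocol


class Upload(Protocol):
    """Assumed shape of an uploaded file: a path-like .name and a binary .file handle."""
    name: str
    file: BinaryIO


def read_upload(file_up: Upload, encoding: str = "utf-8") -> str:
    """Read an upload as text: use the on-disk path when it exists, else the in-memory handle."""
    if os.path.isfile(file_up.name):
        # Newer Gradio versions write uploads to a real temp file at .name
        with open(file_up.name, "rb") as f:
            return f.read().decode(encoding, errors="replace")
    # Fallback: older versions keep the bytes behind the .file handle
    return file_up.file.read().decode(encoding, errors="replace")

The same two-branch check is what `_read_file_contents`, `parse_excel_file_to_df`, and `parse_pdf_file_as_str` do in the diff above, differing only in whether the bytes are then handed to pandas or PyPDF2.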