Spaces:

hack4hope
/

model

Sleeping

App Files Files Community

prHack4Hope

by aerf3gf - opened Aug 9, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+52

-317

This PR is in draft mode

Files changed (3) hide show

app.py +20 -227
main2.py +32 -88
requirements.txt +0 -2

app.py CHANGED Viewed

@@ -1,245 +1,38 @@
 import gradio as gr
-import pandas as pd
-import re
-from sklearn.feature_extraction.text import TfidfVectorizer
-import numpy as np
-from main2 import search_trials  # Import your updated search_trials
-PAGE_SIZE = 5
-PREVIEW_WORDS = 100  # Number of words in collapsed preview
-US_STATES = [
-    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware",
-    "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
-    "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
-    "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico",
-    "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
-    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
-    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "District of Columbia"
-]
-def split_sentences(text):
-    return [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text) if s.strip()]
-def build_input_text(row):
-    text_parts = [
-        f"Brief Summary: {row.get('BriefSummary', '')}",
-        f"Primary Outcome Measure: {row.get('PrimaryOutcomeMeasure', '')}",
-        f"Primary Outcome Description: {row.get('PrimaryOutcomeDescription', '')}",
-        f"Primary Completion Date: {row.get('PrimaryCompletionDate', '')}"
-    ]
-    return " ".join([part for part in text_parts if part.strip()])
-def generate_summary(row, max_sentences=7, min_sentence_length=5):
-    text = build_input_text(row)
-    if not text.strip():
-        return ""
-    sentences = split_sentences(text)
-    sentences = [s for s in sentences if len(s.split()) >= min_sentence_length]
-    if not sentences:
-        return ""
-    if len(sentences) <= max_sentences:
-        return " ".join(sentences)
-    vectorizer = TfidfVectorizer(stop_words="english")
-    tfidf_matrix = vectorizer.fit_transform(sentences)
-    scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
-    position_weights = np.linspace(1.5, 1.0, num=len(sentences))
-    combined_scores = scores * position_weights
-    top_indices = combined_scores.argsort()[-max_sentences:][::-1]
-    top_indices = sorted(top_indices)
-    summary_sentences = []
-    for i in top_indices:
-        s = sentences[i]
-        if re.match(r"^(Start Date|Primary Completion Date|Intervention Name|Primary Outcome Measure|Primary Outcome Description):", s):
-            continue
-        summary_sentences.append(s)
-    if len(summary_sentences) < max_sentences:
-        for i in top_indices:
-            if len(summary_sentences) >= max_sentences:
-                break
-            if sentences[i] not in summary_sentences:
-                summary_sentences.append(sentences[i])
-    return " ".join(summary_sentences[:max_sentences])
 def run_search(age, sex, state, keywords):
-    df = search_trials(
         user_age=age,
         user_sex=sex,
         user_state=state,
-        user_keywords=keywords,
-        generate_summaries=False
     )
-    if df.empty:
-        return pd.DataFrame(), 0, None
-    total_pages = (len(df) + PAGE_SIZE - 1) // PAGE_SIZE
-    page_df = df.iloc[:PAGE_SIZE].copy()
-    page_df['LaymanSummary'] = ""
-    return page_df, total_pages, df
-def load_page(page_num, full_df):
-    if full_df is None or full_df.empty:
-        return pd.DataFrame()
-    start = page_num * PAGE_SIZE
-    end = start + PAGE_SIZE
-    page_df = full_df.iloc[start:end].copy()
-    page_df['LaymanSummary'] = page_df.apply(generate_summary, axis=1)
-    return page_df
-def update_page_controls(page_num, total_pages):
-    prev_visible = gr.update(visible=page_num > 0)
-    next_visible = gr.update(visible=page_num < total_pages - 1)
-    page_text = f"Page {page_num + 1} of {total_pages}" if total_pages > 0 else ""
-    return prev_visible, next_visible, page_text
-def hide_empty_columns(df):
-    cols_to_keep = []
-    for col in df.columns:
-        col_values = df[col].dropna().astype(str).str.strip()
-        if not col_values.empty and any(val != "" for val in col_values):
-            cols_to_keep.append(col)
-    return df[cols_to_keep]
-def df_to_html_with_readmore(df: pd.DataFrame) -> str:
-    if df.empty:
-        return "<p>No matching trials found.</p>"
-    from html import escape
-    if "LaymanSummary" in df.columns:
-        cols = list(df.columns)
-        cols.insert(0, cols.pop(cols.index("LaymanSummary")))
-        df = df[cols]
-    df = hide_empty_columns(df)
-    html = ['''
-    <style>
-        table {
-            width: 100%;
-            border-collapse: collapse;
-            font-family: Arial, sans-serif;
-        }
-        th {
-            background-color: #007bff;
-            color: white;
-            padding: 12px;
-            text-align: left;
-            border: 1px solid #ddd;
-        }
-        td {
-            border: 1px solid #ddd;
-            padding: 12px;
-            vertical-align: top;
-            white-space: normal;
-            max-width: 1000px; /* 2.5x original 400px */
-            min-width: 1000px; /* force width */
-            word-wrap: break-word;
-        }
-        details summary {
-            cursor: pointer;
-            color: #007bff;
-            font-weight: bold;
-        }
-        details summary:after {
-            content: " (Read More)";
-            color: #0056b3;
-            font-weight: normal;
-        }
-        details[open] summary {
-            display: none; /* hide preview when expanded */
-        }
-        details div.full-text {
-            display: none;
-        }
-        details[open] div.full-text {
-            display: block;
-            margin-top: 8px;
-        }
-    </style>
-    ''']
-    html.append('<table><thead><tr>')
-    for col in df.columns:
-        html.append(f'<th>{escape(col)}</th>')
-    html.append('</tr></thead><tbody>')
-    for _, row in df.iterrows():
-        html.append('<tr>')
-        for col in df.columns:
-            val = str(row[col])
-            words = val.split()
-            if len(words) > PREVIEW_WORDS:
-                short_text = escape(" ".join(words[:PREVIEW_WORDS]) + "...")
-                full_text = escape(val)
-                cell_html = f'''
-                <div>
-                    <details>
-                        <summary>{short_text}</summary>
-                        <div class="full-text">{full_text}</div>
-                    </details>
-                </div>
-                '''
-            else:
-                cell_html = f'<div>{escape(val)}</div>'
-            html.append(f'<td>{cell_html}</td>')
-        html.append('</tr>')
-    html.append('</tbody></table>')
-    return "".join(html)
-def on_search(age, sex, state, keywords):
-    df_page, total_pages, full_df = run_search(age, sex, state, keywords)
-    page_num = 0
-    if not df_page.empty:
-        df_page = load_page(page_num, full_df)
-    prev_vis, next_vis, page_text = update_page_controls(page_num, total_pages)
-    html_output = df_to_html_with_readmore(df_page)
-    return html_output, page_text, prev_vis, next_vis, page_num, total_pages, full_df, gr.update(visible=False), gr.update(visible=True)
-def on_page_change(increment, page_num, total_pages, full_df):
-    if full_df is None or full_df.empty:
-        return "<p>No matching trials found.</p>", "", gr.update(visible=False), gr.update(visible=False), 0
-    new_page = max(0, min(page_num + increment, total_pages - 1))
-    page_df = load_page(new_page, full_df)
-    prev_vis, next_vis, page_text = update_page_controls(new_page, total_pages)
-    html_output = df_to_html_with_readmore(page_df)
-    return html_output, page_text, prev_vis, next_vis, new_page
-def show_input_page():
-    return gr.update(visible=True), gr.update(visible=False)
-with gr.Blocks() as demo:
-    gr.Markdown("# Clinical Trials Search Tool with Pagination and Inline Read More")
-    with gr.Column(visible=True) as input_page:
-        gr.Markdown("Find **recruiting US clinical trials** that match your **age**, **sex**, **state**, and optional **keywords**.")
-        with gr.Row():
-            age_input = gr.Number(label="Your Age", value=30)
-            sex_input = gr.Dropdown(["Male", "Female", "All"], label="Sex", value="All")
-        with gr.Row():
-            state_input = gr.Dropdown(US_STATES, label="State", value="California")
-            keywords_input = gr.Textbox(label="Keywords", placeholder="e.g., Cancer, Diabetes")
-        search_btn = gr.Button("Search Trials")
-    with gr.Column(visible=False) as results_page:
-        output_html = gr.HTML()
-        total_pages_text = gr.Textbox(value="", interactive=False)
-        with gr.Row():
-            prev_btn = gr.Button("Previous Page")
-            next_btn = gr.Button("Next Page")
-            back_btn = gr.Button("Back")
-    page_num_state = gr.State(0)
-    total_pages_state = gr.State(0)
-    full_results_state = gr.State(None)
     search_btn.click(
-        fn=on_search,
         inputs=[age_input, sex_input, state_input, keywords_input],
-        outputs=[output_html, total_pages_text, prev_btn, next_btn, page_num_state, total_pages_state, full_results_state, input_page, results_page]
-    )
-    next_btn.click(
-        fn=on_page_change,
-        inputs=[gr.State(1), page_num_state, total_pages_state, full_results_state],
-        outputs=[output_html, total_pages_text, prev_btn, next_btn, page_num_state]
-    )
-    prev_btn.click(
-        fn=on_page_change,
-        inputs=[gr.State(-1), page_num_state, total_pages_state, full_results_state],
-        outputs=[output_html, total_pages_text, prev_btn, next_btn, page_num_state]
-    )
-    back_btn.click(
-        fn=show_input_page,
-        outputs=[input_page, results_page]
     )
 if __name__ == "__main__":

 import gradio as gr
+from main2 import search_trials  # Importing from main2.py
 def run_search(age, sex, state, keywords):
     """Forward the form values to ``search_trials`` and return its result unchanged.

     Args:
         age: Value of the age field.
         sex: Selected sex option.
         state: Text typed into the state field.
         keywords: Text typed into the keywords field.

     Returns:
         Whatever ``search_trials`` returns for these arguments.
     """
     return search_trials(user_age=age, user_sex=sex, user_state=state, user_keywords=keywords)
# Single-page search form: user inputs on top, results table underneath.
with gr.Blocks() as demo:
    # Page heading followed by a one-line usage hint.
    gr.Markdown(value="#  Clinical Trials Search Tool")
    gr.Markdown(
        value=(
            "Find **recruiting US clinical trials** that match your **age**, "
            "**sex**, **state**, and optional **keywords**."
        )
    )

    # First row: age and sex.
    with gr.Row():
        age_input = gr.Number(value=30, label="Your Age")
        sex_input = gr.Dropdown(choices=["Male", "Female"], value="Male", label="Sex")

    # Second row: state and free-text keywords.
    with gr.Row():
        state_input = gr.Textbox(
            label="State (full name or abbreviation)",
            placeholder="e.g., California",
        )
        keywords_input = gr.Textbox(
            label="Keywords (comma separated)",
            placeholder="e.g., cancer, diabetes",
        )

    search_btn = gr.Button(value="Search Trials")
    output_table = gr.Dataframe(label="Matching Trials", interactive=False)

    # Clicking the button passes the four inputs, in this order, to run_search
    # and renders its return value in the table.
    search_btn.click(
        fn=run_search,
        inputs=[age_input, sex_input, state_input, keywords_input],
        outputs=output_table,
    )
 if __name__ == "__main__":

main2.py CHANGED Viewed

@@ -1,92 +1,35 @@
 import pandas as pd
-import re
-from sklearn.feature_extraction.text import TfidfVectorizer
-import numpy as np
-# Load & preprocess dataset once (global)
-print("Loading and preprocessing dataset...")
-df_full = pd.read_csv("clinical_trials_cleaned_merged.csv")
-def parse_age(age_str):
-    if pd.isnull(age_str):
-        return None
-    parts = str(age_str).split()
-    try:
-        return int(parts[0])
-    except:
-        return None
-df_full["MinAgeNum"] = df_full["MinimumAge"].apply(parse_age)
-df_full["MaxAgeNum"] = df_full["MaximumAge"].apply(parse_age)
-df_full["combined_text"] = df_full.astype(str).agg(" ".join, axis=1).str.lower()
-print(f"Preprocessed {len(df_full)} US recruiting trials.")
-def search_trials(user_age, user_sex, user_state, user_keywords, generate_summaries=True):
-    # Local helpers inside the function
-    def split_sentences(text):
-        # Improved sentence splitter
-        return [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text) if s.strip()]
-    def build_input_text(row):
-        text_parts = [
-            f"Intervention Name: {row.get('InterventionName', '')}",
-            f"Intervention Description: {row.get('InterventionDescription', '')}",
-            f"Brief Summary: {row.get('BriefSummary', '')}",
-            f"Primary Outcome Measure: {row.get('PrimaryOutcomeMeasure', '')}",
-            f"Primary Outcome Description: {row.get('PrimaryOutcomeDescription', '')}",
-            f"Start Date: {row.get('StartDate', '')}",
-            f"Detailed Description: {row.get('DetailedDescription', '')}",
-            f"Eligibility Criteria: {row.get('EligibilityCriteria', '')}"
-        ]
-        return " ".join([part for part in text_parts if part.strip()])
-    def generate_summary(row, max_sentences=7, min_sentence_length=5):
-        text = build_input_text(row)
-        if not text.strip():
-            return ""
-        sentences = split_sentences(text)
-        # Filter out very short sentences
-        sentences = [s for s in sentences if len(s.split()) >= min_sentence_length]
-        if not sentences:
-            return ""
-        if len(sentences) <= max_sentences:
-            return " ".join(sentences)
-        vectorizer = TfidfVectorizer(stop_words="english")
-        tfidf_matrix = vectorizer.fit_transform(sentences)
-        scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
-        # Position weighting: earlier sentences weighted higher
-        position_weights = np.linspace(1.5, 1.0, num=len(sentences))
-        combined_scores = scores * position_weights
-        top_indices = combined_scores.argsort()[-max_sentences:][::-1]
-        top_indices = sorted(top_indices)  # keep original order
-        summary_sentences = []
-        for i in top_indices:
-            s = sentences[i]
-            # Skip sentences that look like metadata labels
-            if re.match(r"^(Start Date|Primary Completion Date|Intervention Name|Primary Outcome Measure|Primary Outcome Description):", s):
-                continue
-            summary_sentences.append(s)
-        # If filtered too aggressively, add back more sentences from top indices
-        if len(summary_sentences) < max_sentences:
-            for i in top_indices:
-                if len(summary_sentences) >= max_sentences:
-                    break
-                if sentences[i] not in summary_sentences:
-                    summary_sentences.append(sentences[i])
-        return " ".join(summary_sentences[:max_sentences])
-    df = df_full.copy()
-    # Prepare keywords list
     if isinstance(user_keywords, str):
         keywords = [k.strip().lower() for k in user_keywords.split(",") if k.strip()]
     elif isinstance(user_keywords, list):
@@ -94,22 +37,23 @@ def search_trials(user_age, user_sex, user_state, user_keywords, generate_summar
     else:
         keywords = []
     sex_mask = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
     age_mask = (df["MinAgeNum"] <= int(user_age)) & (df["MaxAgeNum"] >= int(user_age))
     state_mask = df["LocationState"].str.lower() == str(user_state).lower()
     if keywords:
-        keyword_mask = df["combined_text"].apply(lambda txt: any(k in txt for k in keywords))
     else:
         keyword_mask = True
     filtered_df = df[sex_mask & age_mask & state_mask & keyword_mask].reset_index(drop=True)
-    filtered_df = filtered_df.drop(columns=["MinAgeNum", "MaxAgeNum", "combined_text"], errors="ignore")
-    if generate_summaries and len(filtered_df) > 0:
-        print(f"Generating improved fast extractive summaries for {len(filtered_df)} filtered trials...")
-        filtered_df["LaymanSummary"] = filtered_df.apply(generate_summary, axis=1)
-    else:
-        filtered_df["LaymanSummary"] = ""
     return filtered_df

 import pandas as pd
# Conversion factors for the unit word that follows the number in an age label.
# Labels without a recognised unit are treated as years.
_AGE_UNITS_IN_YEARS = {
    "year": 1.0,
    "month": 1.0 / 12,
    "week": 1.0 / 52,
    "day": 1.0 / 365,
    "hour": 1.0 / (365 * 24),
    "minute": 1.0 / (365 * 24 * 60),
}


def _parse_age_years(age_str):
    """Convert an age label such as ``"18 Years"`` or ``"6 Months"`` to years.

    Returns ``None`` when the value is missing or does not start with an integer.
    """
    if pd.isnull(age_str):
        return None
    parts = str(age_str).split()
    try:
        amount = int(parts[0])
    except (IndexError, ValueError):
        # Blank label or non-numeric first token: the age is unknown.
        return None
    unit = parts[1].lower().rstrip("s") if len(parts) > 1 else "year"
    return amount * _AGE_UNITS_IN_YEARS.get(unit, 1.0)


def _normalize_keywords(user_keywords):
    """Return the user's keywords as a list of non-empty, lower-cased strings.

    Accepts a comma-separated string or a list/tuple/set of values; anything
    else (including ``None``) yields an empty list.
    """
    if isinstance(user_keywords, str):
        user_keywords = user_keywords.split(",")
    elif not isinstance(user_keywords, (list, tuple, set)):
        return []
    cleaned = (str(k).strip().lower() for k in user_keywords if k is not None)
    return [k for k in cleaned if k]


def search_trials(user_age, user_sex, user_state, user_keywords, csv_path="clinical_trials_cleaned_merged.csv"):
    """Search recruiting US clinical trials by demographics and optional keywords.

    Args:
        user_age: Age in years. Converted with ``int()``. ``None``/NaN (for
            example a cleared number field) disables the age filter instead of
            raising.
        user_sex: Sex to match, case-insensitively. Trials whose ``Sex`` is
            ``"all"`` always pass this filter.
        user_state: State to match against ``LocationState``; compared
            case-insensitively with surrounding whitespace ignored.
        user_keywords: Comma-separated string or list of keywords. A trial
            matches when ANY keyword is a substring of the row's text. Empty
            or ``None`` disables the keyword filter.
        csv_path: Path of the trials CSV to load.

    Returns:
        A ``pandas.DataFrame`` with every column of the CSV, containing only
        the matching rows, re-indexed from 0.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        KeyError: If a required column is absent from the CSV.
        ValueError: If ``user_age`` is given but cannot be converted to ``int``.
    """
    # NOTE(review): the file is parsed on every call; consider caching the
    # loaded frame if searches become slow.
    df = pd.read_csv(csv_path)

    # Drop rows that lack any of the fields the filters rely on.
    df = df.dropna(subset=["MinimumAge", "MaximumAge", "Sex", "OverallStatus"])

    # Keep only US trials that are currently recruiting.
    is_us = df["LocationCountry"] == "United States"
    is_recruiting = df["OverallStatus"].str.lower() == "recruiting"
    df = df[is_us & is_recruiting]

    # Cheap, vectorised demographic filters first.
    matches = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
    wanted_state = str(user_state).strip().lower()
    matches = matches & (df["LocationState"].str.strip().str.lower() == wanted_state)

    if not pd.isnull(user_age):
        age = int(user_age)
        # to_numeric forces a float column (NaN for unknown ages) so the
        # comparisons below never see raw None objects; NaN compares False,
        # which excludes trials whose age bounds could not be parsed.
        min_age = pd.to_numeric(df["MinimumAge"].map(_parse_age_years), errors="coerce")
        max_age = pd.to_numeric(df["MaximumAge"].map(_parse_age_years), errors="coerce")
        matches = matches & (min_age <= age) & (max_age >= age)

    candidates = df[matches]

    # The keyword scan is per-row Python work, so it only runs on the rows
    # that survived the filters above. Only original CSV columns are searched.
    keywords = _normalize_keywords(user_keywords)
    if keywords:
        def row_matches_any_keyword(values):
            row_as_str = " ".join(str(x).lower() for x in values if pd.notnull(x))
            return any(k in row_as_str for k in keywords)

        hits = [
            row_matches_any_keyword(values)
            for values in candidates.itertuples(index=False, name=None)
        ]
        # An explicit boolean Series keeps the selection row-wise even when
        # there are no candidate rows at all.
        candidates = candidates.loc[pd.Series(hits, index=candidates.index, dtype=bool)]

    return candidates.reset_index(drop=True)

requirements.txt CHANGED Viewed

@@ -1,5 +1,3 @@
 gradio
 pandas
 requests
-scikit-learn
-numpy

 gradio
 pandas
 requests