Spaces:

hack4hope
/

model

Sleeping

App Files Files Community

swarit222 committed on Aug 9, 2025

Commit

02e2bed

verified ·

1 Parent(s): 1a6054d

Update main2.py

Browse files

Files changed (1) hide show

main2.py +88 -32

main2.py CHANGED Viewed

@@ -1,35 +1,92 @@
 import pandas as pd
-def search_trials(user_age, user_sex, user_state, user_keywords, csv_path="clinical_trials_cleaned_merged.csv"):
-    """
-    Search for recruiting US clinical trials matching the user's demographics & optional keywords.
-    Returns ALL available columns from the dataset.
-    """
-    # === Load dataset ===
-    df = pd.read_csv(csv_path)
-    # Drop missing critical columns
-    df = df.dropna(subset=["MinimumAge", "MaximumAge", "Sex", "OverallStatus"])
-    # Keep only US & recruiting trials
-    df = df[df["LocationCountry"] == "United States"]
-    df = df[df["OverallStatus"].str.lower() == "recruiting"]
-    # Convert ages to numeric
-    def parse_age(age_str):
-        if pd.isnull(age_str):
-            return None
-        parts = str(age_str).split()
-        try:
-            return int(parts[0])
-        except:
-            return None
-    df["MinAgeNum"] = df["MinimumAge"].apply(parse_age)
-    df["MaxAgeNum"] = df["MaximumAge"].apply(parse_age)
-    # Prepare user's keywords list
     if isinstance(user_keywords, str):
         keywords = [k.strip().lower() for k in user_keywords.split(",") if k.strip()]
     elif isinstance(user_keywords, list):
@@ -37,23 +94,22 @@ def search_trials(user_age, user_sex, user_state, user_keywords, csv_path="clini
     else:
         keywords = []
-    # === Create masks ===
     sex_mask = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
     age_mask = (df["MinAgeNum"] <= int(user_age)) & (df["MaxAgeNum"] >= int(user_age))
     state_mask = df["LocationState"].str.lower() == str(user_state).lower()
     if keywords:
-        def row_matches_any_keyword(row):
-            row_as_str = " ".join(str(x).lower() for x in row.values if pd.notnull(x))
-            return any(k in row_as_str for k in keywords)
-        keyword_mask = df.apply(row_matches_any_keyword, axis=1)
     else:
         keyword_mask = True
-    # Apply all filters and return ALL columns
     filtered_df = df[sex_mask & age_mask & state_mask & keyword_mask].reset_index(drop=True)
-    # Drop helper numeric age cols if you don’t want them visible
-    filtered_df = filtered_df.drop(columns=["MinAgeNum", "MaxAgeNum"], errors="ignore")
     return filtered_df

 import pandas as pd
+import re
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+# Load & preprocess the dataset once at import time; the result lives in the module-level `df_full`.
+print("Loading and preprocessing dataset...")
+df_full = pd.read_csv("clinical_trials_cleaned_merged.csv")  # relative path: resolved against the current working directory
+def parse_age(age_str):
+    """Return the leading integer of an age string such as "18 Years".
+
+    Args:
+        age_str: Raw age value; missing values (None/NaN) are accepted.
+
+    Returns:
+        The first whitespace-separated token converted to ``int``, or
+        ``None`` when the value is missing, empty, or does not start with
+        an integer (for example "N/A").
+    """
+    # NOTE(review): the unit after the number is ignored, so "6 Months"
+    # yields 6 -- confirm the dataset only stores ages in years.
+    if pd.isnull(age_str):
+        return None
+    parts = str(age_str).split()
+    try:
+        return int(parts[0])
+    except (IndexError, ValueError):
+        # IndexError: empty/whitespace-only string; ValueError: non-integer
+        # first token. Anything else (e.g. KeyboardInterrupt) now propagates.
+        return None
+df_full["MinAgeNum"] = df_full["MinimumAge"].apply(parse_age)  # leading integer of MinimumAge; missing when parse_age returns None
+df_full["MaxAgeNum"] = df_full["MaximumAge"].apply(parse_age)  # leading integer of MaximumAge; missing when parse_age returns None
+df_full["combined_text"] = df_full.astype(str).agg(" ".join, axis=1).str.lower()  # all columns (helper columns included) stringified, space-joined per row, lowercased; search_trials substring-matches keywords against it
+print(f"Preprocessed {len(df_full)} US recruiting trials.")  # NOTE(review): message says "US recruiting" but no country/status filter is applied in this section -- confirm the CSV is pre-filtered
+def search_trials(user_age, user_sex, user_state, user_keywords, generate_summaries=True):
+    # Local helpers inside the function
+    def split_sentences(text):
+        # Improved sentence splitter
+        return [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text) if s.strip()]
+    def build_input_text(row):
+        text_parts = [
+            f"Intervention Name: {row.get('InterventionName', '')}",
+            f"Intervention Description: {row.get('InterventionDescription', '')}",
+            f"Brief Summary: {row.get('BriefSummary', '')}",
+            f"Primary Outcome Measure: {row.get('PrimaryOutcomeMeasure', '')}",
+            f"Primary Outcome Description: {row.get('PrimaryOutcomeDescription', '')}",
+            f"Start Date: {row.get('StartDate', '')}",
+            f"Detailed Description: {row.get('DetailedDescription', '')}",
+            f"Eligibility Criteria: {row.get('EligibilityCriteria', '')}"
+        ]
+        return " ".join([part for part in text_parts if part.strip()])
+    def generate_summary(row, max_sentences=7, min_sentence_length=5):
+        text = build_input_text(row)
+        if not text.strip():
+            return ""
+        sentences = split_sentences(text)
+        # Filter out very short sentences
+        sentences = [s for s in sentences if len(s.split()) >= min_sentence_length]
+        if not sentences:
+            return ""
+        if len(sentences) <= max_sentences:
+            return " ".join(sentences)
+        vectorizer = TfidfVectorizer(stop_words="english")
+        tfidf_matrix = vectorizer.fit_transform(sentences)
+        scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
+        # Position weighting: earlier sentences weighted higher
+        position_weights = np.linspace(1.5, 1.0, num=len(sentences))
+        combined_scores = scores * position_weights
+        top_indices = combined_scores.argsort()[-max_sentences:][::-1]
+        top_indices = sorted(top_indices)  # keep original order
+        summary_sentences = []
+        for i in top_indices:
+            s = sentences[i]
+            # Skip sentences that look like metadata labels
+            if re.match(r"^(Start Date|Primary Completion Date|Intervention Name|Primary Outcome Measure|Primary Outcome Description):", s):
+                continue
+            summary_sentences.append(s)
+        # If filtered too aggressively, add back more sentences from top indices
+        if len(summary_sentences) < max_sentences:
+            for i in top_indices:
+                if len(summary_sentences) >= max_sentences:
+                    break
+                if sentences[i] not in summary_sentences:
+                    summary_sentences.append(sentences[i])
+        return " ".join(summary_sentences[:max_sentences])
+    df = df_full.copy()
+    # Prepare keywords list
     if isinstance(user_keywords, str):
         keywords = [k.strip().lower() for k in user_keywords.split(",") if k.strip()]
     elif isinstance(user_keywords, list):
     else:
         keywords = []
     sex_mask = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
     age_mask = (df["MinAgeNum"] <= int(user_age)) & (df["MaxAgeNum"] >= int(user_age))
     state_mask = df["LocationState"].str.lower() == str(user_state).lower()
     if keywords:
+        keyword_mask = df["combined_text"].apply(lambda txt: any(k in txt for k in keywords))
     else:
         keyword_mask = True
     filtered_df = df[sex_mask & age_mask & state_mask & keyword_mask].reset_index(drop=True)
+    filtered_df = filtered_df.drop(columns=["MinAgeNum", "MaxAgeNum", "combined_text"], errors="ignore")
+    if generate_summaries and len(filtered_df) > 0:
+        print(f"Generating improved fast extractive summaries for {len(filtered_df)} filtered trials...")
+        filtered_df["LaymanSummary"] = filtered_df.apply(generate_summary, axis=1)
+    else:
+        filtered_df["LaymanSummary"] = ""
     return filtered_df