File size: 4,729 Bytes
34e8e70
02e2bed
 
 
34e8e70
02e2bed
 
 
34e8e70
02e2bed
 
 
 
 
 
 
 
34e8e70
02e2bed
 
 
 
34e8e70
02e2bed
 
34e8e70
02e2bed
 
 
34e8e70
02e2bed
 
 
 
 
 
 
 
 
 
 
 
34e8e70
02e2bed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34e8e70
 
 
 
 
 
 
 
 
 
 
 
02e2bed
34e8e70
 
 
 
02e2bed
34e8e70
02e2bed
 
 
 
 
34e8e70
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load & preprocess dataset once at import time (module-level global).
# search_trials() below copies from df_full on every call, so this runs
# exactly once per process.
print("Loading and preprocessing dataset...")
# NOTE(review): path is relative to the current working directory — confirm
# callers always run from the directory containing the CSV.
df_full = pd.read_csv("clinical_trials_cleaned_merged.csv")

def parse_age(age_str):
    """Extract the leading integer from an age string such as "18 Years".

    Args:
        age_str: raw age cell — typically a string like "18 Years", but may
            be NaN/None (missing) or already numeric.

    Returns:
        The leading whole number as an int, or None when the value is
        missing, empty, or its first token is not an integer (e.g. "N/A").
    """
    if pd.isnull(age_str):
        return None
    parts = str(age_str).split()
    # Guard empty/whitespace-only strings explicitly instead of relying on
    # an exception for the parts[0] lookup.
    if not parts:
        return None
    try:
        return int(parts[0])
    except ValueError:
        # First token is not a plain integer (e.g. "N/A", "18.5") — treat
        # as missing, same as the original behavior.
        return None

# Numeric age bounds parsed from strings like "18 Years"; None when the
# source value is missing or unparseable (see parse_age).
df_full["MinAgeNum"] = df_full["MinimumAge"].apply(parse_age)
df_full["MaxAgeNum"] = df_full["MaximumAge"].apply(parse_age)
# One lower-cased blob of every column per row, used later for cheap
# substring keyword matching in search_trials().
df_full["combined_text"] = df_full.astype(str).agg(" ".join, axis=1).str.lower()
# NOTE(review): message claims "US recruiting trials", but no country/status
# filter is visible here — presumably applied upstream when the CSV was
# produced; confirm.
print(f"Preprocessed {len(df_full)} US recruiting trials.")

def search_trials(user_age, user_sex, user_state, user_keywords, generate_summaries=True):
    """Filter the module-level df_full by age, sex, state, and keywords.

    Args:
        user_age: age in years (coerced with int()); a row matches when
            MinAgeNum <= user_age <= MaxAgeNum. Rows whose parsed age
            bound is None/NaN compare False and are therefore excluded.
        user_sex: matched case-insensitively against the Sex column;
            trials whose Sex is "all" match any input.
        user_state: exact case-insensitive match on LocationState.
        user_keywords: comma-separated string or list of terms; a row is
            kept when ANY term occurs as a substring of its lower-cased
            combined_text. Empty/None/other types disable keyword filtering.
        generate_summaries: when True and there are matches, add a
            TF-IDF-based extractive "LaymanSummary" column; otherwise the
            column is filled with empty strings.

    Returns:
        A new DataFrame of matching rows (index reset, preprocessing helper
        columns dropped) with a "LaymanSummary" column appended. df_full is
        never mutated.
    """
    # Local helpers inside the function

    def split_sentences(text):
        # Split on whitespace following ./?/! while the lookbehinds try to
        # avoid breaking on abbreviations (e.g. "e.g.", "Dr.") and
        # dotted tokens like "U.S." — a common heuristic regex splitter.
        return [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text) if s.strip()]

    def build_input_text(row):
        # Concatenate labeled trial fields into one summarization input.
        # row.get falls back to "" for missing columns, but each part still
        # contains its label text, so the strip() filter below keeps all of
        # them — the labels themselves are screened out later by regex.
        text_parts = [
            f"Intervention Name: {row.get('InterventionName', '')}",
            f"Intervention Description: {row.get('InterventionDescription', '')}",
            f"Brief Summary: {row.get('BriefSummary', '')}",
            f"Primary Outcome Measure: {row.get('PrimaryOutcomeMeasure', '')}",
            f"Primary Outcome Description: {row.get('PrimaryOutcomeDescription', '')}",
            f"Start Date: {row.get('StartDate', '')}",
            f"Detailed Description: {row.get('DetailedDescription', '')}",
            f"Eligibility Criteria: {row.get('EligibilityCriteria', '')}"
        ]
        return " ".join([part for part in text_parts if part.strip()])

    def generate_summary(row, max_sentences=7, min_sentence_length=5):
        # Extractive summary: rank sentences by total TF-IDF weight with an
        # early-position boost, then emit the top ones in document order.
        text = build_input_text(row)
        if not text.strip():
            return ""

        sentences = split_sentences(text)
        # Drop very short sentences (< min_sentence_length words) — mostly
        # labels and fragments with little summary value.
        sentences = [s for s in sentences if len(s.split()) >= min_sentence_length]
        if not sentences:
            return ""

        # Short documents need no ranking; return them verbatim.
        if len(sentences) <= max_sentences:
            return " ".join(sentences)

        vectorizer = TfidfVectorizer(stop_words="english")
        tfidf_matrix = vectorizer.fit_transform(sentences)
        # Sentence score = sum of its TF-IDF weights (favors longer,
        # term-rich sentences).
        scores = np.array(tfidf_matrix.sum(axis=1)).flatten()

        # Position weighting: earlier sentences weighted higher (1.5x
        # tapering linearly to 1.0x for the last sentence).
        position_weights = np.linspace(1.5, 1.0, num=len(sentences))
        combined_scores = scores * position_weights

        top_indices = combined_scores.argsort()[-max_sentences:][::-1]
        top_indices = sorted(top_indices)  # keep original document order

        summary_sentences = []
        for i in top_indices:
            s = sentences[i]
            # Skip sentences that look like bare metadata labels
            if re.match(r"^(Start Date|Primary Completion Date|Intervention Name|Primary Outcome Measure|Primary Outcome Description):", s):
                continue
            summary_sentences.append(s)

        # If filtered too aggressively, add back more sentences from top indices.
        # NOTE(review): this fallback can re-add the metadata-style sentences
        # skipped just above — presumably a deliberate best-effort to always
        # reach max_sentences; confirm intent.
        if len(summary_sentences) < max_sentences:
            for i in top_indices:
                if len(summary_sentences) >= max_sentences:
                    break
                if sentences[i] not in summary_sentences:
                    summary_sentences.append(sentences[i])

        return " ".join(summary_sentences[:max_sentences])

    # Work on a copy so the module-level df_full is never mutated.
    df = df_full.copy()

    # Normalize keywords: accept "a, b" strings or lists; anything else
    # (e.g. None) yields an empty list, which disables keyword filtering.
    if isinstance(user_keywords, str):
        keywords = [k.strip().lower() for k in user_keywords.split(",") if k.strip()]
    elif isinstance(user_keywords, list):
        keywords = [str(k).strip().lower() for k in user_keywords if str(k).strip()]
    else:
        keywords = []

    # "all" trials accept any sex; NaN Sex values become NaN under
    # .str.lower() and fail isin(), dropping the row.
    sex_mask = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
    # NaN age bounds make both comparisons False, excluding the row.
    age_mask = (df["MinAgeNum"] <= int(user_age)) & (df["MaxAgeNum"] >= int(user_age))
    state_mask = df["LocationState"].str.lower() == str(user_state).lower()

    if keywords:
        # OR semantics: any single keyword substring keeps the row.
        keyword_mask = df["combined_text"].apply(lambda txt: any(k in txt for k in keywords))
    else:
        keyword_mask = True  # scalar True broadcasts through the & chain below

    filtered_df = df[sex_mask & age_mask & state_mask & keyword_mask].reset_index(drop=True)
    # Drop the preprocessing helper columns before returning to the caller.
    filtered_df = filtered_df.drop(columns=["MinAgeNum", "MaxAgeNum", "combined_text"], errors="ignore")

    if generate_summaries and len(filtered_df) > 0:
        print(f"Generating improved fast extractive summaries for {len(filtered_df)} filtered trials...")
        filtered_df["LaymanSummary"] = filtered_df.apply(generate_summary, axis=1)
    else:
        # Summaries disabled or no matches: still provide the column so the
        # returned schema is stable for callers.
        filtered_df["LaymanSummary"] = ""

    return filtered_df