swarit222 committed on
Commit
02e2bed
·
verified ·
1 Parent(s): 1a6054d

Update main2.py

Browse files
Files changed (1) hide show
  1. main2.py +88 -32
main2.py CHANGED
@@ -1,35 +1,92 @@
1
  import pandas as pd
 
 
 
2
 
3
- def search_trials(user_age, user_sex, user_state, user_keywords, csv_path="clinical_trials_cleaned_merged.csv"):
4
- """
5
- Search for recruiting US clinical trials matching the user's demographics & optional keywords.
6
- Returns ALL available columns from the dataset.
7
- """
8
 
9
- # === Load dataset ===
10
- df = pd.read_csv(csv_path)
 
 
 
 
 
 
11
 
12
- # Drop missing critical columns
13
- df = df.dropna(subset=["MinimumAge", "MaximumAge", "Sex", "OverallStatus"])
 
 
14
 
15
- # Keep only US & recruiting trials
16
- df = df[df["LocationCountry"] == "United States"]
17
- df = df[df["OverallStatus"].str.lower() == "recruiting"]
18
 
19
- # Convert ages to numeric
20
- def parse_age(age_str):
21
- if pd.isnull(age_str):
22
- return None
23
- parts = str(age_str).split()
24
- try:
25
- return int(parts[0])
26
- except:
27
- return None
28
 
29
- df["MinAgeNum"] = df["MinimumAge"].apply(parse_age)
30
- df["MaxAgeNum"] = df["MaximumAge"].apply(parse_age)
 
 
 
 
 
 
 
 
 
 
31
 
32
- # Prepare user's keywords list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  if isinstance(user_keywords, str):
34
  keywords = [k.strip().lower() for k in user_keywords.split(",") if k.strip()]
35
  elif isinstance(user_keywords, list):
@@ -37,23 +94,22 @@ def search_trials(user_age, user_sex, user_state, user_keywords, csv_path="clini
37
  else:
38
  keywords = []
39
 
40
- # === Create masks ===
41
  sex_mask = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
42
  age_mask = (df["MinAgeNum"] <= int(user_age)) & (df["MaxAgeNum"] >= int(user_age))
43
  state_mask = df["LocationState"].str.lower() == str(user_state).lower()
44
 
45
  if keywords:
46
- def row_matches_any_keyword(row):
47
- row_as_str = " ".join(str(x).lower() for x in row.values if pd.notnull(x))
48
- return any(k in row_as_str for k in keywords)
49
- keyword_mask = df.apply(row_matches_any_keyword, axis=1)
50
  else:
51
  keyword_mask = True
52
 
53
- # Apply all filters and return ALL columns
54
  filtered_df = df[sex_mask & age_mask & state_mask & keyword_mask].reset_index(drop=True)
 
55
 
56
- # Drop helper numeric age cols if you don’t want them visible
57
- filtered_df = filtered_df.drop(columns=["MinAgeNum", "MaxAgeNum"], errors="ignore")
 
 
 
58
 
59
  return filtered_df
 
1
  import pandas as pd
2
+ import re
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ import numpy as np
5
 
6
+ # Load & preprocess dataset once (global)
7
+ print("Loading and preprocessing dataset...")
8
+ df_full = pd.read_csv("clinical_trials_cleaned_merged.csv")
 
 
9
 
10
def parse_age(age_str):
    """Convert an age string such as "18 Years" or "6 Months" into years.

    The first whitespace-separated token is read as an integer and the
    optional second token as its unit. Values given in years (or with no
    unit, or with an unrecognised unit) are returned unchanged as ``int``;
    values given in months, weeks, days, hours or minutes are converted to a
    fractional number of years so that they can be compared against a user
    age expressed in years.

    NOTE(review): assumes the dataset uses "<number> <unit>" age strings in
    the ClinicalTrials.gov style -- confirm against the CSV.

    Args:
        age_str: Raw cell value; may be a string, a number, or missing.

    Returns:
        The age in years (``int`` or ``float``), or ``None`` when the value
        is missing, empty, or does not start with an integer.
    """
    if pd.isnull(age_str):
        return None

    parts = str(age_str).split()
    try:
        value = int(parts[0])
    except (IndexError, ValueError):
        # IndexError: empty/whitespace-only string; ValueError: "N/A" etc.
        return None

    if len(parts) < 2:
        return value

    # How many of each unit make up one year (approximate for weeks/days).
    units_per_year = {
        "month": 12,
        "week": 52,
        "day": 365,
        "hour": 365 * 24,
        "minute": 365 * 24 * 60,
    }
    unit = parts[1].lower().rstrip("s")
    divisor = units_per_year.get(unit)
    if divisor is None:
        # "year(s)" or an unknown unit: keep the number as-is.
        return value
    return value / divisor
18
 
19
+ df_full["MinAgeNum"] = df_full["MinimumAge"].apply(parse_age)
20
+ df_full["MaxAgeNum"] = df_full["MaximumAge"].apply(parse_age)
21
+ df_full["combined_text"] = df_full.astype(str).agg(" ".join, axis=1).str.lower()
22
+ print(f"Preprocessed {len(df_full)} US recruiting trials.")
23
 
24
+ def search_trials(user_age, user_sex, user_state, user_keywords, generate_summaries=True):
25
+ # Local helpers inside the function
 
26
 
27
def split_sentences(text):
    """Split *text* into a list of stripped, non-empty sentences.

    A split happens at a whitespace character that follows ".", "?" or "!",
    unless the preceding characters look like a dotted abbreviation
    (e.g. "e.g.") or a capitalised two-letter title (e.g. "Dr.").
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    found = []
    for fragment in re.split(boundary, text):
        cleaned = fragment.strip()
        if cleaned:
            found.append(cleaned)
    return found
 
 
 
 
 
 
30
 
31
def build_input_text(row):
    """Join the descriptive fields of a trial row into one labelled string.

    Each field that has a value is rendered as "<Label>: <value>" and the
    pieces are joined with single spaces, in the fixed order listed below.
    Fields that are absent, null/NaN, or blank are left out entirely, so a
    row with no usable text yields "" instead of a string of bare labels
    (or "Label: nan" fragments).

    Args:
        row: Any mapping-like object with a ``.get(key, default)`` method
            (a dict or a pandas Series).

    Returns:
        The combined text, or "" when no field has content.
    """
    labelled_fields = (
        ("Intervention Name", "InterventionName"),
        ("Intervention Description", "InterventionDescription"),
        ("Brief Summary", "BriefSummary"),
        ("Primary Outcome Measure", "PrimaryOutcomeMeasure"),
        ("Primary Outcome Description", "PrimaryOutcomeDescription"),
        ("Start Date", "StartDate"),
        ("Detailed Description", "DetailedDescription"),
        ("Eligibility Criteria", "EligibilityCriteria"),
    )

    text_parts = []
    for label, column in labelled_fields:
        value = row.get(column, "")
        # Missing cells read from CSV arrive as NaN, which would otherwise be
        # formatted as the literal text "nan".
        if value is None or pd.isnull(value):
            continue
        value_text = str(value)
        if not value_text.strip():
            continue
        text_parts.append(f"{label}: {value_text}")
    return " ".join(text_parts)
43
 
44
def generate_summary(row, max_sentences=7, min_sentence_length=5):
    """Build an extractive summary of a trial row.

    The row's text (from ``build_input_text``) is split into sentences with
    ``split_sentences``; sentences with fewer than ``min_sentence_length``
    words are dropped. If at most ``max_sentences`` remain they are all
    returned. Otherwise each sentence is scored by the sum of its TF-IDF
    weights multiplied by a position weight that falls linearly from 1.5
    (first sentence) to 1.0 (last), and the best-scoring sentences are kept.

    Sentences that start with a metadata label ("Start Date:", etc.) are
    only used when there are not enough other sentences to reach
    ``max_sentences``; the next-best content sentences are preferred over
    them. The chosen sentences are emitted in their original order.

    Args:
        row: Mapping-like trial record passed to ``build_input_text``.
        max_sentences: Upper bound on the number of sentences returned.
        min_sentence_length: Minimum word count for a sentence to be kept.

    Returns:
        The summary as a single space-joined string, or "" when the row has
        no usable text.
    """
    text = build_input_text(row)
    if not text.strip():
        return ""

    sentences = [
        s for s in split_sentences(text)
        if len(s.split()) >= min_sentence_length
    ]
    if not sentences:
        return ""

    if len(sentences) <= max_sentences:
        return " ".join(sentences)

    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(sentences)
    scores = np.array(tfidf_matrix.sum(axis=1)).flatten()

    # Position weighting: earlier sentences weighted higher.
    position_weights = np.linspace(1.5, 1.0, num=len(sentences))
    combined_scores = scores * position_weights

    # All sentence indices, best score first.
    ranked = combined_scores.argsort()[::-1]

    label_pattern = re.compile(
        r"^(Start Date|Primary Completion Date|Intervention Name|Primary Outcome Measure|Primary Outcome Description):"
    )
    content, labelled = [], []
    for i in ranked:
        (labelled if label_pattern.match(sentences[i]) else content).append(i)

    # Prefer content sentences; fall back to label-like ones only to fill
    # the remaining slots, so filtering never just re-adds what it removed.
    chosen = (content + labelled)[:max_sentences]

    return " ".join(sentences[i] for i in sorted(chosen))
86
+
87
+ df = df_full.copy()
88
+
89
+ # Prepare keywords list
90
  if isinstance(user_keywords, str):
91
  keywords = [k.strip().lower() for k in user_keywords.split(",") if k.strip()]
92
  elif isinstance(user_keywords, list):
 
94
  else:
95
  keywords = []
96
 
 
97
  sex_mask = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
98
  age_mask = (df["MinAgeNum"] <= int(user_age)) & (df["MaxAgeNum"] >= int(user_age))
99
  state_mask = df["LocationState"].str.lower() == str(user_state).lower()
100
 
101
  if keywords:
102
+ keyword_mask = df["combined_text"].apply(lambda txt: any(k in txt for k in keywords))
 
 
 
103
  else:
104
  keyword_mask = True
105
 
 
106
  filtered_df = df[sex_mask & age_mask & state_mask & keyword_mask].reset_index(drop=True)
107
+ filtered_df = filtered_df.drop(columns=["MinAgeNum", "MaxAgeNum", "combined_text"], errors="ignore")
108
 
109
+ if generate_summaries and len(filtered_df) > 0:
110
+ print(f"Generating improved fast extractive summaries for {len(filtered_df)} filtered trials...")
111
+ filtered_df["LaymanSummary"] = filtered_df.apply(generate_summary, axis=1)
112
+ else:
113
+ filtered_df["LaymanSummary"] = ""
114
 
115
  return filtered_df