prHack4Hope

#1
by aerf3gf - opened
Files changed (3) hide show
  1. app.py +20 -227
  2. main2.py +32 -88
  3. requirements.txt +0 -2
app.py CHANGED
@@ -1,245 +1,38 @@
1
  import gradio as gr
2
- import pandas as pd
3
- import re
4
- from sklearn.feature_extraction.text import TfidfVectorizer
5
- import numpy as np
6
- from main2 import search_trials # Import your updated search_trials
7
-
8
- PAGE_SIZE = 5
9
- PREVIEW_WORDS = 100 # Number of words in collapsed preview
10
-
11
- US_STATES = [
12
- "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware",
13
- "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
14
- "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
15
- "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico",
16
- "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
17
- "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
18
- "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "District of Columbia"
19
- ]
20
-
21
- def split_sentences(text):
22
- return [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text) if s.strip()]
23
-
24
- def build_input_text(row):
25
- text_parts = [
26
- f"Brief Summary: {row.get('BriefSummary', '')}",
27
- f"Primary Outcome Measure: {row.get('PrimaryOutcomeMeasure', '')}",
28
- f"Primary Outcome Description: {row.get('PrimaryOutcomeDescription', '')}",
29
- f"Primary Completion Date: {row.get('PrimaryCompletionDate', '')}"
30
- ]
31
- return " ".join([part for part in text_parts if part.strip()])
32
-
33
- def generate_summary(row, max_sentences=7, min_sentence_length=5):
34
- text = build_input_text(row)
35
- if not text.strip():
36
- return ""
37
- sentences = split_sentences(text)
38
- sentences = [s for s in sentences if len(s.split()) >= min_sentence_length]
39
- if not sentences:
40
- return ""
41
- if len(sentences) <= max_sentences:
42
- return " ".join(sentences)
43
- vectorizer = TfidfVectorizer(stop_words="english")
44
- tfidf_matrix = vectorizer.fit_transform(sentences)
45
- scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
46
- position_weights = np.linspace(1.5, 1.0, num=len(sentences))
47
- combined_scores = scores * position_weights
48
- top_indices = combined_scores.argsort()[-max_sentences:][::-1]
49
- top_indices = sorted(top_indices)
50
- summary_sentences = []
51
- for i in top_indices:
52
- s = sentences[i]
53
- if re.match(r"^(Start Date|Primary Completion Date|Intervention Name|Primary Outcome Measure|Primary Outcome Description):", s):
54
- continue
55
- summary_sentences.append(s)
56
- if len(summary_sentences) < max_sentences:
57
- for i in top_indices:
58
- if len(summary_sentences) >= max_sentences:
59
- break
60
- if sentences[i] not in summary_sentences:
61
- summary_sentences.append(sentences[i])
62
- return " ".join(summary_sentences[:max_sentences])
63
 
64
  def run_search(age, sex, state, keywords):
65
- df = search_trials(
66
  user_age=age,
67
  user_sex=sex,
68
  user_state=state,
69
- user_keywords=keywords,
70
- generate_summaries=False
71
  )
72
- if df.empty:
73
- return pd.DataFrame(), 0, None
74
- total_pages = (len(df) + PAGE_SIZE - 1) // PAGE_SIZE
75
- page_df = df.iloc[:PAGE_SIZE].copy()
76
- page_df['LaymanSummary'] = ""
77
- return page_df, total_pages, df
78
-
79
- def load_page(page_num, full_df):
80
- if full_df is None or full_df.empty:
81
- return pd.DataFrame()
82
- start = page_num * PAGE_SIZE
83
- end = start + PAGE_SIZE
84
- page_df = full_df.iloc[start:end].copy()
85
- page_df['LaymanSummary'] = page_df.apply(generate_summary, axis=1)
86
- return page_df
87
-
88
- def update_page_controls(page_num, total_pages):
89
- prev_visible = gr.update(visible=page_num > 0)
90
- next_visible = gr.update(visible=page_num < total_pages - 1)
91
- page_text = f"Page {page_num + 1} of {total_pages}" if total_pages > 0 else ""
92
- return prev_visible, next_visible, page_text
93
 
94
- def hide_empty_columns(df):
95
- cols_to_keep = []
96
- for col in df.columns:
97
- col_values = df[col].dropna().astype(str).str.strip()
98
- if not col_values.empty and any(val != "" for val in col_values):
99
- cols_to_keep.append(col)
100
- return df[cols_to_keep]
101
 
102
- def df_to_html_with_readmore(df: pd.DataFrame) -> str:
103
- if df.empty:
104
- return "<p>No matching trials found.</p>"
105
- from html import escape
106
- if "LaymanSummary" in df.columns:
107
- cols = list(df.columns)
108
- cols.insert(0, cols.pop(cols.index("LaymanSummary")))
109
- df = df[cols]
110
- df = hide_empty_columns(df)
111
- html = ['''
112
- <style>
113
- table {
114
- width: 100%;
115
- border-collapse: collapse;
116
- font-family: Arial, sans-serif;
117
- }
118
- th {
119
- background-color: #007bff;
120
- color: white;
121
- padding: 12px;
122
- text-align: left;
123
- border: 1px solid #ddd;
124
- }
125
- td {
126
- border: 1px solid #ddd;
127
- padding: 12px;
128
- vertical-align: top;
129
- white-space: normal;
130
- max-width: 1000px; /* 2.5x original 400px */
131
- min-width: 1000px; /* force width */
132
- word-wrap: break-word;
133
- }
134
- details summary {
135
- cursor: pointer;
136
- color: #007bff;
137
- font-weight: bold;
138
- }
139
- details summary:after {
140
- content: " (Read More)";
141
- color: #0056b3;
142
- font-weight: normal;
143
- }
144
- details[open] summary {
145
- display: none; /* hide preview when expanded */
146
- }
147
- details div.full-text {
148
- display: none;
149
- }
150
- details[open] div.full-text {
151
- display: block;
152
- margin-top: 8px;
153
- }
154
- </style>
155
- ''']
156
- html.append('<table><thead><tr>')
157
- for col in df.columns:
158
- html.append(f'<th>{escape(col)}</th>')
159
- html.append('</tr></thead><tbody>')
160
- for _, row in df.iterrows():
161
- html.append('<tr>')
162
- for col in df.columns:
163
- val = str(row[col])
164
- words = val.split()
165
- if len(words) > PREVIEW_WORDS:
166
- short_text = escape(" ".join(words[:PREVIEW_WORDS]) + "...")
167
- full_text = escape(val)
168
- cell_html = f'''
169
- <div>
170
- <details>
171
- <summary>{short_text}</summary>
172
- <div class="full-text">{full_text}</div>
173
- </details>
174
- </div>
175
- '''
176
- else:
177
- cell_html = f'<div>{escape(val)}</div>'
178
- html.append(f'<td>{cell_html}</td>')
179
- html.append('</tr>')
180
- html.append('</tbody></table>')
181
- return "".join(html)
182
 
183
- def on_search(age, sex, state, keywords):
184
- df_page, total_pages, full_df = run_search(age, sex, state, keywords)
185
- page_num = 0
186
- if not df_page.empty:
187
- df_page = load_page(page_num, full_df)
188
- prev_vis, next_vis, page_text = update_page_controls(page_num, total_pages)
189
- html_output = df_to_html_with_readmore(df_page)
190
- return html_output, page_text, prev_vis, next_vis, page_num, total_pages, full_df, gr.update(visible=False), gr.update(visible=True)
191
 
192
- def on_page_change(increment, page_num, total_pages, full_df):
193
- if full_df is None or full_df.empty:
194
- return "<p>No matching trials found.</p>", "", gr.update(visible=False), gr.update(visible=False), 0
195
- new_page = max(0, min(page_num + increment, total_pages - 1))
196
- page_df = load_page(new_page, full_df)
197
- prev_vis, next_vis, page_text = update_page_controls(new_page, total_pages)
198
- html_output = df_to_html_with_readmore(page_df)
199
- return html_output, page_text, prev_vis, next_vis, new_page
200
 
201
- def show_input_page():
202
- return gr.update(visible=True), gr.update(visible=False)
203
 
204
- with gr.Blocks() as demo:
205
- gr.Markdown("# Clinical Trials Search Tool with Pagination and Inline Read More")
206
- with gr.Column(visible=True) as input_page:
207
- gr.Markdown("Find **recruiting US clinical trials** that match your **age**, **sex**, **state**, and optional **keywords**.")
208
- with gr.Row():
209
- age_input = gr.Number(label="Your Age", value=30)
210
- sex_input = gr.Dropdown(["Male", "Female", "All"], label="Sex", value="All")
211
- with gr.Row():
212
- state_input = gr.Dropdown(US_STATES, label="State", value="California")
213
- keywords_input = gr.Textbox(label="Keywords", placeholder="e.g., Cancer, Diabetes")
214
- search_btn = gr.Button("Search Trials")
215
- with gr.Column(visible=False) as results_page:
216
- output_html = gr.HTML()
217
- total_pages_text = gr.Textbox(value="", interactive=False)
218
- with gr.Row():
219
- prev_btn = gr.Button("Previous Page")
220
- next_btn = gr.Button("Next Page")
221
- back_btn = gr.Button("Back")
222
- page_num_state = gr.State(0)
223
- total_pages_state = gr.State(0)
224
- full_results_state = gr.State(None)
225
  search_btn.click(
226
- fn=on_search,
227
  inputs=[age_input, sex_input, state_input, keywords_input],
228
- outputs=[output_html, total_pages_text, prev_btn, next_btn, page_num_state, total_pages_state, full_results_state, input_page, results_page]
229
- )
230
- next_btn.click(
231
- fn=on_page_change,
232
- inputs=[gr.State(1), page_num_state, total_pages_state, full_results_state],
233
- outputs=[output_html, total_pages_text, prev_btn, next_btn, page_num_state]
234
- )
235
- prev_btn.click(
236
- fn=on_page_change,
237
- inputs=[gr.State(-1), page_num_state, total_pages_state, full_results_state],
238
- outputs=[output_html, total_pages_text, prev_btn, next_btn, page_num_state]
239
- )
240
- back_btn.click(
241
- fn=show_input_page,
242
- outputs=[input_page, results_page]
243
  )
244
 
245
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ from main2 import search_trials # Importing from main2.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def run_search(age, sex, state, keywords):
5
+ results = search_trials(
6
  user_age=age,
7
  user_sex=sex,
8
  user_state=state,
9
+ user_keywords=keywords
 
10
  )
11
+ return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ with gr.Blocks() as demo:
14
+ gr.Markdown("# Clinical Trials Search Tool")
15
+ gr.Markdown(
16
+ "Find **recruiting US clinical trials** that match your **age**, **sex**, "
17
+ "**state**, and optional **keywords**."
18
+ )
 
19
 
20
+ with gr.Row():
21
+ age_input = gr.Number(label="Your Age", value=30)
22
+ sex_input = gr.Dropdown(["Male", "Female"], label="Sex", value="Male")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ with gr.Row():
25
+ state_input = gr.Textbox(label="State (full name or abbreviation)", placeholder="e.g., California")
26
+ keywords_input = gr.Textbox(label="Keywords (comma separated)", placeholder="e.g., cancer, diabetes")
 
 
 
 
 
27
 
28
+ search_btn = gr.Button("Search Trials")
 
 
 
 
 
 
 
29
 
30
+ output_table = gr.Dataframe(label="Matching Trials", interactive=False)
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  search_btn.click(
33
+ fn=run_search,
34
  inputs=[age_input, sex_input, state_input, keywords_input],
35
+ outputs=output_table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  )
37
 
38
  if __name__ == "__main__":
main2.py CHANGED
@@ -1,92 +1,35 @@
1
  import pandas as pd
2
- import re
3
- from sklearn.feature_extraction.text import TfidfVectorizer
4
- import numpy as np
5
 
6
- # Load & preprocess dataset once (global)
7
- print("Loading and preprocessing dataset...")
8
- df_full = pd.read_csv("clinical_trials_cleaned_merged.csv")
 
 
9
 
10
- def parse_age(age_str):
11
- if pd.isnull(age_str):
12
- return None
13
- parts = str(age_str).split()
14
- try:
15
- return int(parts[0])
16
- except:
17
- return None
18
 
19
- df_full["MinAgeNum"] = df_full["MinimumAge"].apply(parse_age)
20
- df_full["MaxAgeNum"] = df_full["MaximumAge"].apply(parse_age)
21
- df_full["combined_text"] = df_full.astype(str).agg(" ".join, axis=1).str.lower()
22
- print(f"Preprocessed {len(df_full)} US recruiting trials.")
23
 
24
- def search_trials(user_age, user_sex, user_state, user_keywords, generate_summaries=True):
25
- # Local helpers inside the function
 
26
 
27
- def split_sentences(text):
28
- # Improved sentence splitter
29
- return [s.strip() for s in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text) if s.strip()]
 
 
 
 
 
 
30
 
31
- def build_input_text(row):
32
- text_parts = [
33
- f"Intervention Name: {row.get('InterventionName', '')}",
34
- f"Intervention Description: {row.get('InterventionDescription', '')}",
35
- f"Brief Summary: {row.get('BriefSummary', '')}",
36
- f"Primary Outcome Measure: {row.get('PrimaryOutcomeMeasure', '')}",
37
- f"Primary Outcome Description: {row.get('PrimaryOutcomeDescription', '')}",
38
- f"Start Date: {row.get('StartDate', '')}",
39
- f"Detailed Description: {row.get('DetailedDescription', '')}",
40
- f"Eligibility Criteria: {row.get('EligibilityCriteria', '')}"
41
- ]
42
- return " ".join([part for part in text_parts if part.strip()])
43
 
44
- def generate_summary(row, max_sentences=7, min_sentence_length=5):
45
- text = build_input_text(row)
46
- if not text.strip():
47
- return ""
48
-
49
- sentences = split_sentences(text)
50
- # Filter out very short sentences
51
- sentences = [s for s in sentences if len(s.split()) >= min_sentence_length]
52
- if not sentences:
53
- return ""
54
-
55
- if len(sentences) <= max_sentences:
56
- return " ".join(sentences)
57
-
58
- vectorizer = TfidfVectorizer(stop_words="english")
59
- tfidf_matrix = vectorizer.fit_transform(sentences)
60
- scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
61
-
62
- # Position weighting: earlier sentences weighted higher
63
- position_weights = np.linspace(1.5, 1.0, num=len(sentences))
64
- combined_scores = scores * position_weights
65
-
66
- top_indices = combined_scores.argsort()[-max_sentences:][::-1]
67
- top_indices = sorted(top_indices) # keep original order
68
-
69
- summary_sentences = []
70
- for i in top_indices:
71
- s = sentences[i]
72
- # Skip sentences that look like metadata labels
73
- if re.match(r"^(Start Date|Primary Completion Date|Intervention Name|Primary Outcome Measure|Primary Outcome Description):", s):
74
- continue
75
- summary_sentences.append(s)
76
-
77
- # If filtered too aggressively, add back more sentences from top indices
78
- if len(summary_sentences) < max_sentences:
79
- for i in top_indices:
80
- if len(summary_sentences) >= max_sentences:
81
- break
82
- if sentences[i] not in summary_sentences:
83
- summary_sentences.append(sentences[i])
84
-
85
- return " ".join(summary_sentences[:max_sentences])
86
-
87
- df = df_full.copy()
88
-
89
- # Prepare keywords list
90
  if isinstance(user_keywords, str):
91
  keywords = [k.strip().lower() for k in user_keywords.split(",") if k.strip()]
92
  elif isinstance(user_keywords, list):
@@ -94,22 +37,23 @@ def search_trials(user_age, user_sex, user_state, user_keywords, generate_summar
94
  else:
95
  keywords = []
96
 
 
97
  sex_mask = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
98
  age_mask = (df["MinAgeNum"] <= int(user_age)) & (df["MaxAgeNum"] >= int(user_age))
99
  state_mask = df["LocationState"].str.lower() == str(user_state).lower()
100
 
101
  if keywords:
102
- keyword_mask = df["combined_text"].apply(lambda txt: any(k in txt for k in keywords))
 
 
 
103
  else:
104
  keyword_mask = True
105
 
 
106
  filtered_df = df[sex_mask & age_mask & state_mask & keyword_mask].reset_index(drop=True)
107
- filtered_df = filtered_df.drop(columns=["MinAgeNum", "MaxAgeNum", "combined_text"], errors="ignore")
108
 
109
- if generate_summaries and len(filtered_df) > 0:
110
- print(f"Generating improved fast extractive summaries for {len(filtered_df)} filtered trials...")
111
- filtered_df["LaymanSummary"] = filtered_df.apply(generate_summary, axis=1)
112
- else:
113
- filtered_df["LaymanSummary"] = ""
114
 
115
  return filtered_df
 
1
  import pandas as pd
 
 
 
2
 
3
+ def search_trials(user_age, user_sex, user_state, user_keywords, csv_path="clinical_trials_cleaned_merged.csv"):
4
+ """
5
+ Search for recruiting US clinical trials matching the user's demographics & optional keywords.
6
+ Returns ALL available columns from the dataset.
7
+ """
8
 
9
+ # === Load dataset ===
10
+ df = pd.read_csv(csv_path)
 
 
 
 
 
 
11
 
12
+ # Drop rows that have missing values in any of the critical columns
13
+ df = df.dropna(subset=["MinimumAge", "MaximumAge", "Sex", "OverallStatus"])
 
 
14
 
15
+ # Keep only US & recruiting trials
16
+ df = df[df["LocationCountry"] == "United States"]
17
+ df = df[df["OverallStatus"].str.lower() == "recruiting"]
18
 
19
+ # Convert ages to numeric
20
+ def parse_age(age_str):
21
+ if pd.isnull(age_str):
22
+ return None
23
+ parts = str(age_str).split()
24
+ try:
25
+ return int(parts[0])
26
+ except:
27
+ return None
28
 
29
+ df["MinAgeNum"] = df["MinimumAge"].apply(parse_age)
30
+ df["MaxAgeNum"] = df["MaximumAge"].apply(parse_age)
 
 
 
 
 
 
 
 
 
 
31
 
32
+ # Prepare user's keywords list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  if isinstance(user_keywords, str):
34
  keywords = [k.strip().lower() for k in user_keywords.split(",") if k.strip()]
35
  elif isinstance(user_keywords, list):
 
37
  else:
38
  keywords = []
39
 
40
+ # === Create masks ===
41
  sex_mask = df["Sex"].str.lower().isin([str(user_sex).lower(), "all"])
42
  age_mask = (df["MinAgeNum"] <= int(user_age)) & (df["MaxAgeNum"] >= int(user_age))
43
  state_mask = df["LocationState"].str.lower() == str(user_state).lower()
44
 
45
  if keywords:
46
+ def row_matches_any_keyword(row):
47
+ row_as_str = " ".join(str(x).lower() for x in row.values if pd.notnull(x))
48
+ return any(k in row_as_str for k in keywords)
49
+ keyword_mask = df.apply(row_matches_any_keyword, axis=1)
50
  else:
51
  keyword_mask = True
52
 
53
+ # Apply all filters and return ALL columns
54
  filtered_df = df[sex_mask & age_mask & state_mask & keyword_mask].reset_index(drop=True)
 
55
 
56
+ # Drop the helper numeric age columns so they do not appear in the returned results
57
+ filtered_df = filtered_df.drop(columns=["MinAgeNum", "MaxAgeNum"], errors="ignore")
 
 
 
58
 
59
  return filtered_df
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
  gradio
2
  pandas
3
  requests
4
- scikit-learn
5
- numpy
 
1
  gradio
2
  pandas
3
  requests