Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -181,7 +181,7 @@ def improve_summary_generation(text, model, tokenizer):
|
|
| 181 |
"length_penalty": 1.5,
|
| 182 |
"no_repeat_ngram_size": 3,
|
| 183 |
"temperature": 0.7,
|
| 184 |
-
"repetition_penalty": 1.5
|
| 185 |
}
|
| 186 |
)
|
| 187 |
|
|
@@ -214,12 +214,6 @@ def improve_summary_generation(text, model, tokenizer):
|
|
| 214 |
|
| 215 |
def validate_summary(summary, original_text):
|
| 216 |
"""Validate summary content against original text"""
|
| 217 |
-
import re
|
| 218 |
-
|
| 219 |
-
# Don't validate empty summaries
|
| 220 |
-
if not summary or not original_text:
|
| 221 |
-
return False
|
| 222 |
-
|
| 223 |
# Check for age inconsistencies
|
| 224 |
age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
|
| 225 |
if len(age_mentions) > 1: # Multiple age mentions
|
|
@@ -237,72 +231,8 @@ def validate_summary(summary, original_text):
|
|
| 237 |
if summary_words < 20 or summary_words > original_words * 0.8:
|
| 238 |
return False
|
| 239 |
|
| 240 |
-
# Check for common error patterns
|
| 241 |
-
error_patterns = [
|
| 242 |
-
r'mean.*mean',
|
| 243 |
-
r'median.*median',
|
| 244 |
-
r'results.*results',
|
| 245 |
-
r'conclusion.*conclusion',
|
| 246 |
-
r'significance.*significance'
|
| 247 |
-
]
|
| 248 |
-
|
| 249 |
-
for pattern in error_patterns:
|
| 250 |
-
if len(re.findall(pattern, summary.lower())) > 1:
|
| 251 |
-
return False
|
| 252 |
-
|
| 253 |
return True
|
| 254 |
|
| 255 |
-
def post_process_summary(summary):
    """Clean up a generated summary.

    Steps, in order:
      1. Split the text into sentences on periods (decimal points such as
         ``0.05`` are left intact).
      2. Keep only the first sentence that mentions the word "age"
         (``age``/``ages``/``aged``); later ones are dropped.
      3. Drop empty sentences and near-duplicates (sentences whose sorted,
         lower-cased words form a contiguous run inside an already kept
         sentence's sorted words, or vice versa).
      4. Re-join with ``". "``, make sure the text ends with a period, and
         remove a few filler phrases and repeated spaces.

    Args:
        summary: The summary text. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned summary string.
    """
    # Local import so the function works regardless of the module's imports.
    import re

    if not summary:
        return summary

    # A period between two digits is a decimal point, not a sentence end.
    sentences = re.split(r'(?<!\d)\.|\.(?!\d)', summary)

    # Match "age" as a word; a plain substring test would also hit
    # "average", "dosage", "stage", ... and drop unrelated sentences.
    age_word = re.compile(r'\bage[sd]?\b')

    # Remove contradictory age statements: only the first one survives.
    kept = []
    age_seen = False
    for sentence in sentences:
        if age_word.search(sentence.lower()):
            if age_seen:
                continue
            age_seen = True
        kept.append(sentence)

    # Remove redundant statements.
    seen_cores = []
    unique_sentences = []
    for sentence in kept:
        if not sentence.strip():
            continue

        # Order-insensitive signature; padded with spaces so containment is
        # checked on whole words rather than on arbitrary character runs.
        core = ' ' + ' '.join(sorted(sentence.lower().split())) + ' '
        if any(core in other or other in core for other in seen_cores):
            continue

        seen_cores.append(core)
        unique_sentences.append(sentence)

    # Join sentences with proper spacing and punctuation.
    cleaned_summary = '. '.join(s.strip() for s in unique_sentences)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'

    # Additional cleaning of filler phrases.
    cleaned_summary = cleaned_summary.replace(" and and ", " and ")
    cleaned_summary = cleaned_summary.replace("results showed", "")
    cleaned_summary = cleaned_summary.replace("results indicated", "")
    # Removing phrases can leave runs of spaces behind; collapse them.
    cleaned_summary = re.sub(r' {2,}', ' ', cleaned_summary).strip()

    return cleaned_summary
|
| 305 |
-
|
| 306 |
def generate_focused_summary(question, abstracts, model, tokenizer):
|
| 307 |
"""Generate focused summary based on question"""
|
| 308 |
# Preprocess each abstract
|
|
@@ -327,22 +257,63 @@ def generate_focused_summary(question, abstracts, model, tokenizer):
|
|
| 327 |
|
| 328 |
return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
| 329 |
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
|
|
|
|
| 346 |
|
| 347 |
def main():
|
| 348 |
st.title("🔬 Biomedical Papers Analysis")
|
|
@@ -391,46 +362,59 @@ def main():
|
|
| 391 |
if st.session_state.summaries is None:
|
| 392 |
try:
|
| 393 |
with st.spinner("Generating individual paper summaries..."):
|
| 394 |
-
# Load summarization model
|
| 395 |
model, tokenizer = load_model("summarize")
|
| 396 |
-
|
| 397 |
-
# Generate summaries for each abstract
|
| 398 |
summaries = []
|
| 399 |
progress_bar = st.progress(0)
|
| 400 |
|
| 401 |
for idx, abstract in enumerate(df['Abstract']):
|
| 402 |
-
# Replace this line
|
| 403 |
-
# summary = generate_summary(abstract, model, tokenizer)
|
| 404 |
-
# With this line
|
| 405 |
summary = improve_summary_generation(abstract, model, tokenizer)
|
| 406 |
summaries.append(summary)
|
| 407 |
progress_bar.progress((idx + 1) / len(df))
|
| 408 |
|
| 409 |
-
# Store summaries in session state
|
| 410 |
st.session_state.summaries = summaries
|
| 411 |
-
|
| 412 |
-
# Cleanup
|
| 413 |
cleanup_model(model, tokenizer)
|
| 414 |
progress_bar.empty()
|
| 415 |
|
| 416 |
except Exception as e:
|
| 417 |
st.error(f"Error generating summaries: {str(e)}")
|
| 418 |
-
st.session_state.processing_started = False
|
| 419 |
|
| 420 |
-
# Display summaries with improved sorting
|
| 421 |
if st.session_state.summaries is not None:
|
| 422 |
col1, col2 = st.columns(2)
|
| 423 |
with col1:
|
| 424 |
sort_options = ['Article Title', 'Authors', 'Publication Year', 'Source Title']
|
| 425 |
-
sort_column = st.selectbox("Sort by:", sort_options)
|
| 426 |
with col2:
|
| 427 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
-
# Create display dataframe
|
| 430 |
display_df = df.copy()
|
| 431 |
display_df['Summary'] = st.session_state.summaries
|
| 432 |
display_df['Publication Year'] = display_df['Publication Year'].astype(int)
|
| 433 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
|
| 435 |
# Apply custom styling
|
| 436 |
st.markdown("""
|
|
@@ -463,7 +447,7 @@ def main():
|
|
| 463 |
</style>
|
| 464 |
""", unsafe_allow_html=True)
|
| 465 |
|
| 466 |
-
# Display papers
|
| 467 |
for _, row in sorted_df.iterrows():
|
| 468 |
paper_info_cols = st.columns([1, 1])
|
| 469 |
|
|
@@ -489,7 +473,7 @@ def main():
|
|
| 489 |
</div>
|
| 490 |
""", unsafe_allow_html=True)
|
| 491 |
|
| 492 |
-
# Add
|
| 493 |
st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
|
| 494 |
|
| 495 |
# Question-focused Summary Section (only if question provided)
|
|
|
|
| 181 |
"length_penalty": 1.5,
|
| 182 |
"no_repeat_ngram_size": 3,
|
| 183 |
"temperature": 0.7,
|
| 184 |
+
"repetition_penalty": 1.5
|
| 185 |
}
|
| 186 |
)
|
| 187 |
|
|
|
|
| 214 |
|
| 215 |
def validate_summary(summary, original_text):
|
| 216 |
"""Validate summary content against original text"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
# Check for age inconsistencies
|
| 218 |
age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
|
| 219 |
if len(age_mentions) > 1: # Multiple age mentions
|
|
|
|
| 231 |
if summary_words < 20 or summary_words > original_words * 0.8:
|
| 232 |
return False
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
return True
|
| 235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
def generate_focused_summary(question, abstracts, model, tokenizer):
|
| 237 |
"""Generate focused summary based on question"""
|
| 238 |
# Preprocess each abstract
|
|
|
|
| 257 |
|
| 258 |
return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
| 259 |
|
| 260 |
+
def _split_authors(cell):
    """Return the non-empty, stripped names in a ';'-separated authors cell."""
    if not isinstance(cell, str):
        # Missing values (NaN/None) carry no author names.
        return []
    return [name.strip() for name in cell.split(';') if name.strip()]


def create_filter_controls(df, sort_column):
    """Render the filter widgets for ``sort_column`` and return the filtered rows.

    Depending on the selected column this draws:
      * 'Publication Year' - two ``st.number_input`` boxes (From/To); rows
        whose year lies in the inclusive range are kept. If the user enters
        the bounds in reverse order they are swapped rather than producing
        an empty result.
      * 'Authors' - an ``st.multiselect`` of the individual names found in
        the ';'-separated 'Authors' column; a row is kept when at least one
        of its authors is selected. Names are compared exactly.
      * 'Source Title' - an ``st.multiselect`` of the distinct source titles.
      * any other value (including 'Article Title') - no widgets, no filter.

    With nothing selected in a multiselect, all rows are kept.

    Args:
        df: DataFrame with the columns used above.
        sort_column: Name of the column chosen in the "Sort/Filter by" box.

    Returns:
        A new DataFrame (never ``df`` itself) containing the matching rows
        in their original order.
    """
    filtered_df = df.copy()

    # Nothing to filter; also avoids int(NaN) from min()/max() on no rows.
    if filtered_df.empty:
        return filtered_df

    if sort_column == 'Publication Year':
        years = df['Publication Year'].dropna()
        if years.empty:
            return filtered_df

        # Year range inputs
        year_min = int(years.min())
        year_max = int(years.max())
        col1, col2 = st.columns(2)
        with col1:
            start_year = st.number_input('From Year',
                                         min_value=year_min,
                                         max_value=year_max,
                                         value=year_min)
        with col2:
            end_year = st.number_input('To Year',
                                       min_value=year_min,
                                       max_value=year_max,
                                       value=year_max)

        # Tolerate From > To instead of silently showing no papers.
        low, high = sorted((start_year, end_year))
        filtered_df = filtered_df[
            (filtered_df['Publication Year'] >= low) &
            (filtered_df['Publication Year'] <= high)
        ]

    elif sort_column == 'Authors':
        # Multi-select for authors
        unique_authors = sorted({
            name
            for cell in df['Authors'].dropna()
            for name in _split_authors(cell)
        })
        selected_authors = st.multiselect(
            'Select Authors',
            unique_authors
        )
        if selected_authors:
            wanted = set(selected_authors)
            # Exact name match: a substring test would let "Li" select "Liu".
            filtered_df = filtered_df[
                filtered_df['Authors'].apply(
                    lambda cell: not wanted.isdisjoint(_split_authors(cell))
                )
            ]

    elif sort_column == 'Source Title':
        # Multi-select for source titles; NaN is dropped because sorting a
        # mix of float NaN and str raises TypeError.
        unique_sources = sorted(df['Source Title'].dropna().unique())
        selected_sources = st.multiselect(
            'Select Sources',
            unique_sources
        )
        if selected_sources:
            filtered_df = filtered_df[filtered_df['Source Title'].isin(selected_sources)]

    # 'Article Title' (and any unrecognised column): sorting only, no filtering.

    return filtered_df
|
| 317 |
|
| 318 |
def main():
|
| 319 |
st.title("🔬 Biomedical Papers Analysis")
|
|
|
|
| 362 |
if st.session_state.summaries is None:
|
| 363 |
try:
|
| 364 |
with st.spinner("Generating individual paper summaries..."):
|
|
|
|
| 365 |
model, tokenizer = load_model("summarize")
|
|
|
|
|
|
|
| 366 |
summaries = []
|
| 367 |
progress_bar = st.progress(0)
|
| 368 |
|
| 369 |
for idx, abstract in enumerate(df['Abstract']):
|
|
|
|
|
|
|
|
|
|
| 370 |
summary = improve_summary_generation(abstract, model, tokenizer)
|
| 371 |
summaries.append(summary)
|
| 372 |
progress_bar.progress((idx + 1) / len(df))
|
| 373 |
|
|
|
|
| 374 |
st.session_state.summaries = summaries
|
|
|
|
|
|
|
| 375 |
cleanup_model(model, tokenizer)
|
| 376 |
progress_bar.empty()
|
| 377 |
|
| 378 |
except Exception as e:
|
| 379 |
st.error(f"Error generating summaries: {str(e)}")
|
| 380 |
+
st.session_state.processing_started = False
|
| 381 |
|
| 382 |
+
# Display summaries with improved sorting and filtering
|
| 383 |
if st.session_state.summaries is not None:
|
| 384 |
col1, col2 = st.columns(2)
|
| 385 |
with col1:
|
| 386 |
sort_options = ['Article Title', 'Authors', 'Publication Year', 'Source Title']
|
| 387 |
+
sort_column = st.selectbox("Sort/Filter by:", sort_options)
|
| 388 |
with col2:
|
| 389 |
+
# Only show A-Z/Z-A option for Article Title
|
| 390 |
+
if sort_column == 'Article Title':
|
| 391 |
+
ascending = st.radio(
|
| 392 |
+
"Sort order",
|
| 393 |
+
["A to Z", "Z to A"],
|
| 394 |
+
horizontal=True
|
| 395 |
+
) == "A to Z"
|
| 396 |
+
else:
|
| 397 |
+
ascending = True # Default for other columns
|
| 398 |
|
| 399 |
+
# Create display dataframe
|
| 400 |
display_df = df.copy()
|
| 401 |
display_df['Summary'] = st.session_state.summaries
|
| 402 |
display_df['Publication Year'] = display_df['Publication Year'].astype(int)
|
| 403 |
+
|
| 404 |
+
# Apply filters
|
| 405 |
+
filtered_df = create_filter_controls(display_df, sort_column)
|
| 406 |
+
|
| 407 |
+
if sort_column == 'Article Title':
|
| 408 |
+
# Sort alphabetically
|
| 409 |
+
sorted_df = filtered_df.sort_values(by=sort_column, ascending=ascending)
|
| 410 |
+
else:
|
| 411 |
+
# Keep original order for other columns after filtering
|
| 412 |
+
# (no sorting is applied for these columns; only 'Article Title' is sorted)
|
| 413 |
+
sorted_df = filtered_df
|
| 414 |
+
|
| 415 |
+
# Show number of filtered results
|
| 416 |
+
if len(sorted_df) != len(display_df):
|
| 417 |
+
st.write(f"Showing {len(sorted_df)} of {len(display_df)} papers")
|
| 418 |
|
| 419 |
# Apply custom styling
|
| 420 |
st.markdown("""
|
|
|
|
| 447 |
</style>
|
| 448 |
""", unsafe_allow_html=True)
|
| 449 |
|
| 450 |
+
# Display papers using the filtered and sorted dataframe
|
| 451 |
for _, row in sorted_df.iterrows():
|
| 452 |
paper_info_cols = st.columns([1, 1])
|
| 453 |
|
|
|
|
| 473 |
</div>
|
| 474 |
""", unsafe_allow_html=True)
|
| 475 |
|
| 476 |
+
# Add spacing between papers
|
| 477 |
st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
|
| 478 |
|
| 479 |
# Question-focused Summary Section (only if question provided)
|