Spaces:
Build error
Build error
Filip Makraduli committed on
Commit ·
ec8c22e
1
Parent(s): 0d82e4d
recency fix again
Browse files- research_ai_agent.py +31 -74
research_ai_agent.py
CHANGED
|
@@ -225,53 +225,31 @@ def setup_superlinked_minimal(df):
|
|
| 225 |
max_date = df['published'].max()
|
| 226 |
print(f"📅 Dataset date range: {min_date} to {max_date}")
|
| 227 |
|
| 228 |
-
# DYNAMIC RECENCY: Calculate periods based on actual data, not hardcoded dates
|
| 229 |
-
# This will work regardless of when the code is run
|
| 230 |
-
|
| 231 |
-
# Calculate the age range in the dataset
|
| 232 |
-
current_time = pd.Timestamp.now()
|
| 233 |
-
df['age_days'] = (current_time - df['published']).dt.days
|
| 234 |
-
|
| 235 |
-
min_age = df['age_days'].min()
|
| 236 |
-
max_age = df['age_days'].max()
|
| 237 |
-
|
| 238 |
-
print(f"π Dataset age range: {min_age} to {max_age} days old")
|
| 239 |
-
print(f"π That's {min_age/365:.1f} to {max_age/365:.1f} years old")
|
| 240 |
-
|
| 241 |
-
# Find the age difference between newest and oldest papers
|
| 242 |
-
age_span = max_age - min_age
|
| 243 |
-
|
| 244 |
-
print(f"π Age span in dataset: {age_span} days ({age_span/365:.2f} years)")
|
| 245 |
-
|
| 246 |
-
# Create periods that give graduated scoring based on relative age within the dataset
|
| 247 |
-
# Newer papers (closer to min_age) get more points
|
| 248 |
-
period_1 = min_age + (age_span * 0.25) # Newest 25% of papers
|
| 249 |
-
period_2 = min_age + (age_span * 0.50) # Newest 50% of papers
|
| 250 |
-
period_3 = min_age + (age_span * 0.75) # Newest 75% of papers
|
| 251 |
-
period_4 = max_age + 365 # All papers in dataset + buffer
|
| 252 |
-
|
| 253 |
-
print(f"π Recency periods (days old):")
|
| 254 |
-
print(f" Period 1: {period_1:.0f} days ({period_1/365:.2f} years)")
|
| 255 |
-
print(f" Period 2: {period_2:.0f} days ({period_2/365:.2f} years)")
|
| 256 |
-
print(f" Period 3: {period_3:.0f} days ({period_3/365:.2f} years)")
|
| 257 |
-
print(f" Period 4: {period_4:.0f} days ({period_4/365:.2f} years)")
|
| 258 |
-
|
| 259 |
# Text similarity space
|
| 260 |
text_space = sl.TextSimilaritySpace(
|
| 261 |
text=sl.chunk(paper.text, chunk_size=1000, chunk_overlap=0),
|
| 262 |
model="sentence-transformers/all-mpnet-base-v2"
|
| 263 |
)
|
| 264 |
|
| 265 |
-
#
|
|
|
|
| 266 |
recency_space = sl.RecencySpace(
|
| 267 |
timestamp=paper.published_unix,
|
| 268 |
period_time_list=[
|
| 269 |
-
sl.PeriodTime(timedelta(days=
|
| 270 |
-
sl.PeriodTime(timedelta(days=
|
| 271 |
-
sl.PeriodTime(timedelta(days=
|
| 272 |
-
sl.PeriodTime(timedelta(days=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
],
|
| 274 |
-
negative_filter=-
|
| 275 |
)
|
| 276 |
|
| 277 |
# Create index
|
|
@@ -353,53 +331,32 @@ def setup_superlinked_ultrafast(df):
|
|
| 353 |
max_date = df['published'].max()
|
| 354 |
print(f"📅 Dataset date range: {min_date} to {max_date}")
|
| 355 |
|
| 356 |
-
# DYNAMIC RECENCY: Calculate periods based on actual data, not hardcoded dates
|
| 357 |
-
# This will work regardless of when the code is run
|
| 358 |
-
|
| 359 |
-
# Calculate the age range in the dataset
|
| 360 |
-
current_time = pd.Timestamp.now()
|
| 361 |
-
df['age_days'] = (current_time - df['published']).dt.days
|
| 362 |
-
|
| 363 |
-
min_age = df['age_days'].min()
|
| 364 |
-
max_age = df['age_days'].max()
|
| 365 |
-
|
| 366 |
-
print(f"π Dataset age range: {min_age} to {max_age} days old")
|
| 367 |
-
print(f"π That's {min_age/365:.1f} to {max_age/365:.1f} years old")
|
| 368 |
-
|
| 369 |
-
# Find the age difference between newest and oldest papers
|
| 370 |
-
age_span = max_age - min_age
|
| 371 |
-
|
| 372 |
-
print(f"π Age span in dataset: {age_span} days ({age_span/365:.2f} years)")
|
| 373 |
-
|
| 374 |
-
# Create periods that give graduated scoring based on relative age within the dataset
|
| 375 |
-
# Newer papers (closer to min_age) get more points
|
| 376 |
-
period_1 = min_age + (age_span * 0.25) # Newest 25% of papers
|
| 377 |
-
period_2 = min_age + (age_span * 0.50) # Newest 50% of papers
|
| 378 |
-
period_3 = min_age + (age_span * 0.75) # Newest 75% of papers
|
| 379 |
-
period_4 = max_age + 365 # All papers in dataset + buffer
|
| 380 |
-
|
| 381 |
-
print(f"π Recency periods (days old):")
|
| 382 |
-
print(f" Period 1: {period_1:.0f} days ({period_1/365:.2f} years)")
|
| 383 |
-
print(f" Period 2: {period_2:.0f} days ({period_2/365:.2f} years)")
|
| 384 |
-
print(f" Period 3: {period_3:.0f} days ({period_3/365:.2f} years)")
|
| 385 |
-
print(f" Period 4: {period_4:.0f} days ({period_4/365:.2f} years)")
|
| 386 |
-
|
| 387 |
# Text similarity space
|
| 388 |
text_space = sl.TextSimilaritySpace(
|
| 389 |
text=sl.chunk(paper.text, chunk_size=1000, chunk_overlap=0),
|
| 390 |
model="sentence-transformers/all-mpnet-base-v2"
|
| 391 |
)
|
| 392 |
|
| 393 |
-
#
|
|
|
|
|
|
|
| 394 |
recency_space = sl.RecencySpace(
|
| 395 |
timestamp=paper.published_unix,
|
| 396 |
period_time_list=[
|
| 397 |
-
sl.PeriodTime(timedelta(days=
|
| 398 |
-
sl.PeriodTime(timedelta(days=
|
| 399 |
-
sl.PeriodTime(timedelta(days=
|
| 400 |
-
sl.PeriodTime(timedelta(days=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
],
|
| 402 |
-
negative_filter=-
|
| 403 |
)
|
| 404 |
|
| 405 |
# Create index with both spaces - following query_time_weights.ipynb pattern
|
|
|
|
| 225 |
max_date = df['published'].max()
|
| 226 |
print(f"📅 Dataset date range: {min_date} to {max_date}")
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
# Text similarity space
|
| 229 |
text_space = sl.TextSimilaritySpace(
|
| 230 |
text=sl.chunk(paper.text, chunk_size=1000, chunk_overlap=0),
|
| 231 |
model="sentence-transformers/all-mpnet-base-v2"
|
| 232 |
)
|
| 233 |
|
| 234 |
+
# CORRECT RECENCY: Following the official example pattern
|
| 235 |
+
# Expanded for historical dataset (1993-2025 = ~32 years)
|
| 236 |
recency_space = sl.RecencySpace(
|
| 237 |
timestamp=paper.published_unix,
|
| 238 |
period_time_list=[
|
| 239 |
+
sl.PeriodTime(timedelta(days=365)), # papers within 1 year
|
| 240 |
+
sl.PeriodTime(timedelta(days=2*365)), # papers within 2 years
|
| 241 |
+
sl.PeriodTime(timedelta(days=3*365)), # papers within 3 years
|
| 242 |
+
sl.PeriodTime(timedelta(days=5*365)), # papers within 5 years
|
| 243 |
+
sl.PeriodTime(timedelta(days=10*365)), # papers within 10 years
|
| 244 |
+
sl.PeriodTime(timedelta(days=15*365)), # papers within 15 years
|
| 245 |
+
sl.PeriodTime(timedelta(days=20*365)), # papers within 20 years
|
| 246 |
+
sl.PeriodTime(timedelta(days=25*365)), # papers within 25 years
|
| 247 |
+
sl.PeriodTime(timedelta(days=30*365)), # papers within 30 years
|
| 248 |
+
sl.PeriodTime(timedelta(days=31*365)), # papers within 31 years
|
| 249 |
+
sl.PeriodTime(timedelta(days=31*365 + 120)), # papers within 31.33 years (includes Feb 1994)
|
| 250 |
+
sl.PeriodTime(timedelta(days=32*365)), # papers within 32 years (includes both)
|
| 251 |
],
|
| 252 |
+
negative_filter=-0.25
|
| 253 |
)
|
| 254 |
|
| 255 |
# Create index
|
|
|
|
| 331 |
max_date = df['published'].max()
|
| 332 |
print(f"📅 Dataset date range: {min_date} to {max_date}")
|
| 333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
# Text similarity space
|
| 335 |
text_space = sl.TextSimilaritySpace(
|
| 336 |
text=sl.chunk(paper.text, chunk_size=1000, chunk_overlap=0),
|
| 337 |
model="sentence-transformers/all-mpnet-base-v2"
|
| 338 |
)
|
| 339 |
|
| 340 |
+
# CORRECT RECENCY: Following the official example pattern
|
| 341 |
+
# Expanded for historical dataset (1993-2025 = ~32 years)
|
| 342 |
+
# Added granular periods for 30-32 year range to differentiate 1993 vs 1994
|
| 343 |
recency_space = sl.RecencySpace(
|
| 344 |
timestamp=paper.published_unix,
|
| 345 |
period_time_list=[
|
| 346 |
+
sl.PeriodTime(timedelta(days=365)), # papers within 1 year
|
| 347 |
+
sl.PeriodTime(timedelta(days=2*365)), # papers within 2 years
|
| 348 |
+
sl.PeriodTime(timedelta(days=3*365)), # papers within 3 years
|
| 349 |
+
sl.PeriodTime(timedelta(days=5*365)), # papers within 5 years
|
| 350 |
+
sl.PeriodTime(timedelta(days=10*365)), # papers within 10 years
|
| 351 |
+
sl.PeriodTime(timedelta(days=15*365)), # papers within 15 years
|
| 352 |
+
sl.PeriodTime(timedelta(days=20*365)), # papers within 20 years
|
| 353 |
+
sl.PeriodTime(timedelta(days=25*365)), # papers within 25 years
|
| 354 |
+
sl.PeriodTime(timedelta(days=30*365)), # papers within 30 years
|
| 355 |
+
sl.PeriodTime(timedelta(days=31*365)), # papers within 31 years
|
| 356 |
+
sl.PeriodTime(timedelta(days=31*365 + 120)), # papers within 31.33 years (includes Feb 1994)
|
| 357 |
+
sl.PeriodTime(timedelta(days=32*365)), # papers within 32 years (includes both)
|
| 358 |
],
|
| 359 |
+
negative_filter=-0.25
|
| 360 |
)
|
| 361 |
|
| 362 |
# Create index with both spaces - following query_time_weights.ipynb pattern
|