Filip Makraduli committed on
Commit
ec8c22e
·
1 Parent(s): 0d82e4d

recency fix again

Browse files
Files changed (1) hide show
  1. research_ai_agent.py +31 -74
research_ai_agent.py CHANGED
@@ -225,53 +225,31 @@ def setup_superlinked_minimal(df):
225
  max_date = df['published'].max()
226
  print(f"πŸ“… Dataset date range: {min_date} to {max_date}")
227
 
228
- # DYNAMIC RECENCY: Calculate periods based on actual data, not hardcoded dates
229
- # This will work regardless of when the code is run
230
-
231
- # Calculate the age range in the dataset
232
- current_time = pd.Timestamp.now()
233
- df['age_days'] = (current_time - df['published']).dt.days
234
-
235
- min_age = df['age_days'].min()
236
- max_age = df['age_days'].max()
237
-
238
- print(f"πŸ“Š Dataset age range: {min_age} to {max_age} days old")
239
- print(f"πŸ“Š That's {min_age/365:.1f} to {max_age/365:.1f} years old")
240
-
241
- # Find the age difference between newest and oldest papers
242
- age_span = max_age - min_age
243
-
244
- print(f"πŸ“Š Age span in dataset: {age_span} days ({age_span/365:.2f} years)")
245
-
246
- # Create periods that give graduated scoring based on relative age within the dataset
247
- # Newer papers (closer to min_age) get more points
248
- period_1 = min_age + (age_span * 0.25) # Newest 25% of papers
249
- period_2 = min_age + (age_span * 0.50) # Newest 50% of papers
250
- period_3 = min_age + (age_span * 0.75) # Newest 75% of papers
251
- period_4 = max_age + 365 # All papers in dataset + buffer
252
-
253
- print(f"πŸ“Š Recency periods (days old):")
254
- print(f" Period 1: {period_1:.0f} days ({period_1/365:.2f} years)")
255
- print(f" Period 2: {period_2:.0f} days ({period_2/365:.2f} years)")
256
- print(f" Period 3: {period_3:.0f} days ({period_3/365:.2f} years)")
257
- print(f" Period 4: {period_4:.0f} days ({period_4/365:.2f} years)")
258
-
259
  # Text similarity space
260
  text_space = sl.TextSimilaritySpace(
261
  text=sl.chunk(paper.text, chunk_size=1000, chunk_overlap=0),
262
  model="sentence-transformers/all-mpnet-base-v2"
263
  )
264
 
265
- # ADAPTIVE RECENCY: Periods calculated from actual dataset age distribution
 
266
  recency_space = sl.RecencySpace(
267
  timestamp=paper.published_unix,
268
  period_time_list=[
269
- sl.PeriodTime(timedelta(days=period_1)), # Newest 25% get extra point
270
- sl.PeriodTime(timedelta(days=period_2)), # Newest 50% get extra point
271
- sl.PeriodTime(timedelta(days=period_3)), # Newest 75% get extra point
272
- sl.PeriodTime(timedelta(days=period_4)), # All papers get base point
 
 
 
 
 
 
 
 
273
  ],
274
- negative_filter=-2.0 # Penalty for papers older than dataset range
275
  )
276
 
277
  # Create index
@@ -353,53 +331,32 @@ def setup_superlinked_ultrafast(df):
353
  max_date = df['published'].max()
354
  print(f"πŸ“… Dataset date range: {min_date} to {max_date}")
355
 
356
- # DYNAMIC RECENCY: Calculate periods based on actual data, not hardcoded dates
357
- # This will work regardless of when the code is run
358
-
359
- # Calculate the age range in the dataset
360
- current_time = pd.Timestamp.now()
361
- df['age_days'] = (current_time - df['published']).dt.days
362
-
363
- min_age = df['age_days'].min()
364
- max_age = df['age_days'].max()
365
-
366
- print(f"πŸ“Š Dataset age range: {min_age} to {max_age} days old")
367
- print(f"πŸ“Š That's {min_age/365:.1f} to {max_age/365:.1f} years old")
368
-
369
- # Find the age difference between newest and oldest papers
370
- age_span = max_age - min_age
371
-
372
- print(f"πŸ“Š Age span in dataset: {age_span} days ({age_span/365:.2f} years)")
373
-
374
- # Create periods that give graduated scoring based on relative age within the dataset
375
- # Newer papers (closer to min_age) get more points
376
- period_1 = min_age + (age_span * 0.25) # Newest 25% of papers
377
- period_2 = min_age + (age_span * 0.50) # Newest 50% of papers
378
- period_3 = min_age + (age_span * 0.75) # Newest 75% of papers
379
- period_4 = max_age + 365 # All papers in dataset + buffer
380
-
381
- print(f"πŸ“Š Recency periods (days old):")
382
- print(f" Period 1: {period_1:.0f} days ({period_1/365:.2f} years)")
383
- print(f" Period 2: {period_2:.0f} days ({period_2/365:.2f} years)")
384
- print(f" Period 3: {period_3:.0f} days ({period_3/365:.2f} years)")
385
- print(f" Period 4: {period_4:.0f} days ({period_4/365:.2f} years)")
386
-
387
  # Text similarity space
388
  text_space = sl.TextSimilaritySpace(
389
  text=sl.chunk(paper.text, chunk_size=1000, chunk_overlap=0),
390
  model="sentence-transformers/all-mpnet-base-v2"
391
  )
392
 
393
- # ADAPTIVE RECENCY: Periods calculated from actual dataset age distribution
 
 
394
  recency_space = sl.RecencySpace(
395
  timestamp=paper.published_unix,
396
  period_time_list=[
397
- sl.PeriodTime(timedelta(days=period_1)), # Newest 25% get extra point
398
- sl.PeriodTime(timedelta(days=period_2)), # Newest 50% get extra point
399
- sl.PeriodTime(timedelta(days=period_3)), # Newest 75% get extra point
400
- sl.PeriodTime(timedelta(days=period_4)), # All papers get base point
 
 
 
 
 
 
 
 
401
  ],
402
- negative_filter=-2.0 # Penalty for papers older than dataset range
403
  )
404
 
405
  # Create index with both spaces - following query_time_weights.ipynb pattern
 
225
  max_date = df['published'].max()
226
  print(f"πŸ“… Dataset date range: {min_date} to {max_date}")
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  # Text similarity space
229
  text_space = sl.TextSimilaritySpace(
230
  text=sl.chunk(paper.text, chunk_size=1000, chunk_overlap=0),
231
  model="sentence-transformers/all-mpnet-base-v2"
232
  )
233
 
234
+ # CORRECT RECENCY: Following the official example pattern
235
+ # Expanded for historical dataset (1993-2025 = ~32 years)
236
  recency_space = sl.RecencySpace(
237
  timestamp=paper.published_unix,
238
  period_time_list=[
239
+ sl.PeriodTime(timedelta(days=365)), # papers within 1 year
240
+ sl.PeriodTime(timedelta(days=2*365)), # papers within 2 years
241
+ sl.PeriodTime(timedelta(days=3*365)), # papers within 3 years
242
+ sl.PeriodTime(timedelta(days=5*365)), # papers within 5 years
243
+ sl.PeriodTime(timedelta(days=10*365)), # papers within 10 years
244
+ sl.PeriodTime(timedelta(days=15*365)), # papers within 15 years
245
+ sl.PeriodTime(timedelta(days=20*365)), # papers within 20 years
246
+ sl.PeriodTime(timedelta(days=25*365)), # papers within 25 years
247
+ sl.PeriodTime(timedelta(days=30*365)), # papers within 30 years
248
+ sl.PeriodTime(timedelta(days=31*365)), # papers within 31 years
249
+ sl.PeriodTime(timedelta(days=31*365 + 120)), # papers within 31.33 years (includes Feb 1994)
250
+ sl.PeriodTime(timedelta(days=32*365)), # papers within 32 years (includes both)
251
  ],
252
+ negative_filter=-0.25
253
  )
254
 
255
  # Create index
 
331
  max_date = df['published'].max()
332
  print(f"πŸ“… Dataset date range: {min_date} to {max_date}")
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  # Text similarity space
335
  text_space = sl.TextSimilaritySpace(
336
  text=sl.chunk(paper.text, chunk_size=1000, chunk_overlap=0),
337
  model="sentence-transformers/all-mpnet-base-v2"
338
  )
339
 
340
+ # CORRECT RECENCY: Following the official example pattern
341
+ # Expanded for historical dataset (1993-2025 = ~32 years)
342
+ # Added granular periods for 30-32 year range to differentiate 1993 vs 1994
343
  recency_space = sl.RecencySpace(
344
  timestamp=paper.published_unix,
345
  period_time_list=[
346
+ sl.PeriodTime(timedelta(days=365)), # papers within 1 year
347
+ sl.PeriodTime(timedelta(days=2*365)), # papers within 2 years
348
+ sl.PeriodTime(timedelta(days=3*365)), # papers within 3 years
349
+ sl.PeriodTime(timedelta(days=5*365)), # papers within 5 years
350
+ sl.PeriodTime(timedelta(days=10*365)), # papers within 10 years
351
+ sl.PeriodTime(timedelta(days=15*365)), # papers within 15 years
352
+ sl.PeriodTime(timedelta(days=20*365)), # papers within 20 years
353
+ sl.PeriodTime(timedelta(days=25*365)), # papers within 25 years
354
+ sl.PeriodTime(timedelta(days=30*365)), # papers within 30 years
355
+ sl.PeriodTime(timedelta(days=31*365)), # papers within 31 years
356
+ sl.PeriodTime(timedelta(days=31*365 + 120)), # papers within 31.33 years (includes Feb 1994)
357
+ sl.PeriodTime(timedelta(days=32*365)), # papers within 32 years (includes both)
358
  ],
359
+ negative_filter=-0.25
360
  )
361
 
362
  # Create index with both spaces - following query_time_weights.ipynb pattern