Spaces:
Runtime error
Runtime error
update
Browse files
app.py
CHANGED
|
@@ -18,33 +18,6 @@ from datetime import timezone # Ensure timezone is imported
|
|
| 18 |
|
| 19 |
api = HfApi()
|
| 20 |
|
| 21 |
-
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
|
| 22 |
-
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
|
| 23 |
-
|
| 24 |
-
# Removed ragatouille and abstract_retriever initialization
|
| 25 |
-
# If INDEX_REPO_ID is not used elsewhere, consider removing related lines
|
| 26 |
-
|
| 27 |
-
# Removed abstract_retriever initialization and search
|
| 28 |
-
|
| 29 |
-
def update_abstract_index() -> None:
|
| 30 |
-
"""
|
| 31 |
-
Removed abstract_retriever update functionality since ragatouille is no longer used.
|
| 32 |
-
"""
|
| 33 |
-
pass # No operation needed
|
| 34 |
-
|
| 35 |
-
# Scheduler for updating abstract index every hour
|
| 36 |
-
# Removed scheduler_abstract as it's no longer necessary
|
| 37 |
-
# If INDEX_REPO_ID is not used elsewhere, consider removing the download
|
| 38 |
-
|
| 39 |
-
# Optionally, remove the snapshot_download if the index is not needed
|
| 40 |
-
# api.snapshot_download(
|
| 41 |
-
# repo_id=INDEX_REPO_ID,
|
| 42 |
-
# repo_type="dataset",
|
| 43 |
-
# local_dir=INDEX_DIR_PATH,
|
| 44 |
-
# )
|
| 45 |
-
|
| 46 |
-
# --- DataFrame Preparation ---
|
| 47 |
-
|
| 48 |
def get_df() -> pd.DataFrame:
|
| 49 |
# Load and merge datasets
|
| 50 |
df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
|
|
@@ -58,7 +31,7 @@ def get_df() -> pd.DataFrame:
|
|
| 58 |
df["date"] = pd.to_datetime(df["date"], errors='coerce')
|
| 59 |
df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
|
| 60 |
|
| 61 |
-
# Prepare the DataFrame by removing 'abstract'
|
| 62 |
paper_info = []
|
| 63 |
for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
|
| 64 |
info = row.copy()
|
|
@@ -87,16 +60,13 @@ class Prettifier:
|
|
| 87 |
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 88 |
new_rows = []
|
| 89 |
for _, row in df.iterrows():
|
| 90 |
-
# Handle
|
| 91 |
-
published_at = row["date"] # Already formatted as "%Y-%m-%d"
|
| 92 |
-
|
| 93 |
-
# Handle date link
|
| 94 |
date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
|
| 95 |
|
| 96 |
new_row = {
|
| 97 |
"arxiv_id": row["arxiv_id"], # Include arxiv_id
|
| 98 |
"date_display": date_display, # For display
|
| 99 |
-
"
|
| 100 |
"paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
|
| 101 |
"title": row["title"],
|
| 102 |
"github": Prettifier.get_github_link(row.get("github", "")),
|
|
@@ -111,7 +81,7 @@ class PaperList:
|
|
| 111 |
COLUMN_INFO = [
|
| 112 |
["arxiv_id", "str"], # Added arxiv_id
|
| 113 |
["date_display", "markdown"],# For display
|
| 114 |
-
["
|
| 115 |
["paper_page", "markdown"],
|
| 116 |
["title", "str"],
|
| 117 |
["github", "markdown"],
|
|
@@ -169,9 +139,9 @@ class PaperManager:
|
|
| 169 |
This mimics the "hotness" algorithm used by platforms like Hacker News.
|
| 170 |
"""
|
| 171 |
upvotes = row.get('👍', 0)
|
| 172 |
-
|
| 173 |
try:
|
| 174 |
-
published_time = datetime.datetime.strptime(
|
| 175 |
except ValueError:
|
| 176 |
# If parsing fails, use current time to minimize the impact on sorting
|
| 177 |
published_time = datetime.datetime.now(timezone.utc)
|
|
@@ -194,7 +164,7 @@ class PaperManager:
|
|
| 194 |
df['score'] = df.apply(self.calculate_score, axis=1)
|
| 195 |
df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
|
| 196 |
elif self.sort_method == "new":
|
| 197 |
-
df_sorted = df.sort_values(by='
|
| 198 |
else:
|
| 199 |
df_sorted = df
|
| 200 |
|
|
@@ -238,9 +208,9 @@ class PaperManager:
|
|
| 238 |
url = f"https://huggingface.co/papers/{paper_id}"
|
| 239 |
upvotes = row.get('👍', 0)
|
| 240 |
comments = row.get('💬', 0)
|
| 241 |
-
|
| 242 |
try:
|
| 243 |
-
published_time = datetime.datetime.strptime(
|
| 244 |
except ValueError:
|
| 245 |
published_time = datetime.datetime.now(timezone.utc)
|
| 246 |
time_diff = datetime.datetime.now(timezone.utc) - published_time
|
|
@@ -574,14 +544,6 @@ with demo:
|
|
| 574 |
outputs=[paper_list]
|
| 575 |
)
|
| 576 |
|
| 577 |
-
# Footer
|
| 578 |
-
gr.Markdown("""
|
| 579 |
-
Related useful Spaces:
|
| 580 |
-
- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
|
| 581 |
-
- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
|
| 582 |
-
- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
|
| 583 |
-
""")
|
| 584 |
-
|
| 585 |
|
| 586 |
# --- Launch the App ---
|
| 587 |
|
|
|
|
| 18 |
|
| 19 |
api = HfApi()
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def get_df() -> pd.DataFrame:
|
| 22 |
# Load and merge datasets
|
| 23 |
df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
|
|
|
|
| 31 |
df["date"] = pd.to_datetime(df["date"], errors='coerce')
|
| 32 |
df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
|
| 33 |
|
| 34 |
+
# Prepare the DataFrame by removing 'abstract'
|
| 35 |
paper_info = []
|
| 36 |
for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
|
| 37 |
info = row.copy()
|
|
|
|
| 60 |
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 61 |
new_rows = []
|
| 62 |
for _, row in df.iterrows():
|
| 63 |
+
# Handle date_display as a clickable link
|
|
|
|
|
|
|
|
|
|
| 64 |
date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
|
| 65 |
|
| 66 |
new_row = {
|
| 67 |
"arxiv_id": row["arxiv_id"], # Include arxiv_id
|
| 68 |
"date_display": date_display, # For display
|
| 69 |
+
"date": row["date"], # For internal calculations
|
| 70 |
"paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
|
| 71 |
"title": row["title"],
|
| 72 |
"github": Prettifier.get_github_link(row.get("github", "")),
|
|
|
|
| 81 |
COLUMN_INFO = [
|
| 82 |
["arxiv_id", "str"], # Added arxiv_id
|
| 83 |
["date_display", "markdown"],# For display
|
| 84 |
+
["date", "str"], # For internal use
|
| 85 |
["paper_page", "markdown"],
|
| 86 |
["title", "str"],
|
| 87 |
["github", "markdown"],
|
|
|
|
| 139 |
This mimics the "hotness" algorithm used by platforms like Hacker News.
|
| 140 |
"""
|
| 141 |
upvotes = row.get('👍', 0)
|
| 142 |
+
date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
|
| 143 |
try:
|
| 144 |
+
published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
| 145 |
except ValueError:
|
| 146 |
# If parsing fails, use current time to minimize the impact on sorting
|
| 147 |
published_time = datetime.datetime.now(timezone.utc)
|
|
|
|
| 164 |
df['score'] = df.apply(self.calculate_score, axis=1)
|
| 165 |
df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
|
| 166 |
elif self.sort_method == "new":
|
| 167 |
+
df_sorted = df.sort_values(by='date', ascending=False) # Sort by 'date' instead of 'published_at'
|
| 168 |
else:
|
| 169 |
df_sorted = df
|
| 170 |
|
|
|
|
| 208 |
url = f"https://huggingface.co/papers/{paper_id}"
|
| 209 |
upvotes = row.get('👍', 0)
|
| 210 |
comments = row.get('💬', 0)
|
| 211 |
+
date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
|
| 212 |
try:
|
| 213 |
+
published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
| 214 |
except ValueError:
|
| 215 |
published_time = datetime.datetime.now(timezone.utc)
|
| 216 |
time_diff = datetime.datetime.now(timezone.utc) - published_time
|
|
|
|
| 544 |
outputs=[paper_list]
|
| 545 |
)
|
| 546 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
|
| 548 |
# --- Launch the App ---
|
| 549 |
|