Commit · b935477
Parent(s): efc2e3f

app.py CHANGED
@@ -55,7 +55,8 @@ ALLOWED_DOMAINS = {
     "arxiv.org",
     "huggingface.co",
     "github.com",
-    "github.io"
+    "github.io",
+    "raw.githubusercontent.com"
 }
 
 if not HF_TOKEN:
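For context, ALLOWED_DOMAINS is presumably consulted when validating user-supplied URLs elsewhere in app.py. A minimal sketch of such a check, assuming subdomain matching is desired (the is_allowed_url helper is hypothetical, not part of this commit):

```python
from urllib.parse import urlparse

ALLOWED_DOMAINS = {
    "arxiv.org",
    "huggingface.co",
    "github.com",
    "github.io",
    "raw.githubusercontent.com",
}

def is_allowed_url(url: str) -> bool:
    # Hypothetical validator: accept exact hosts and their subdomains,
    # e.g. "username.github.io" project pages.
    host = (urlparse(url).hostname or "").lower()
    return any(host == d or host.endswith("." + d) for d in ALLOWED_DOMAINS)
```

Under this reading, adding raw.githubusercontent.com lets README download_urls returned by the GitHub API (used in a later hunk) pass the same check.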
@@ -211,7 +212,14 @@ def extract_links_from_soup(soup, text):
     html_links = [link.get("href") for link in soup.find_all("a") if link.get("href")]
     link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
     markdown_links = link_pattern.findall(text)
-
+
+    # Also extract direct URLs that aren't in markdown format
+    url_pattern = re.compile(r'https?://[^\s\)]+')
+    direct_urls = url_pattern.findall(text)
+
+    # Combine all links and remove duplicates
+    all_links = html_links + markdown_links + direct_urls
+    return list(set(all_links))
 
 
 def scrape_huggingface_paper_page(paper_url: str) -> Dict[str, Any]:
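A quick sketch of what the extended extractor returns, assuming extract_links_from_soup as defined in the hunk above (inputs are illustrative):

```python
from bs4 import BeautifulSoup

html = '<a href="https://huggingface.co/papers/2103.00020">paper</a>'
text = "See [CLIP](https://github.com/openai/CLIP) and https://arxiv.org/abs/2103.00020"

soup = BeautifulSoup(html, "html.parser")
# HTML hrefs, markdown link targets, and (new in this commit) bare URLs
# are combined and de-duplicated into one list.
links = extract_links_from_soup(soup, text)
# links contains all three URLs, in no guaranteed order (set() drops ordering).
```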
@@ -369,22 +377,39 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
         logger.debug(f"Failed to scrape project page: {e}")
 
     # Try GitHub README parsing
-    if row_data.get("Code") is not None and
+    if row_data.get("Code") is not None and "github.com" in row_data["Code"]:
         try:
             repo = row_data["Code"].split("github.com/")[1]
-
-
- if
-
- if
-
- if
-
-
-
-
-
-
+
+            # First try with GitHub API if available
+            if GITHUB_AUTH:
+                readme_response = make_github_request(f"/repos/{repo}/readme")
+                if readme_response:
+                    readme = readme_response.json()
+                    if readme.get("type") == "file" and readme.get("download_url"):
+                        response = cached_request(readme["download_url"])
+                        if response:
+                            soup = BeautifulSoup(response.text, "html.parser")
+                            links = extract_links_from_soup(soup, response.text)
+                            for link in links:
+                                if link and ("arxiv" in link or "huggingface.co/papers" in link):
+                                    logger.info(f"Paper {link} inferred from Code (via GitHub API)")
+                                    return link
+
+            # Fallback: try scraping the GitHub page directly
+            try:
+                github_url = row_data["Code"]
+                response = cached_request(github_url)
+                if response:
+                    soup = BeautifulSoup(response.text, "html.parser")
+                    links = extract_links_from_soup(soup, response.text)
+                    for link in links:
+                        if link and ("arxiv" in link or "huggingface.co/papers" in link):
+                            logger.info(f"Paper {link} inferred from Code (via GitHub scraping)")
+                            return link
+            except (ValidationError, ExternalAPIError):
+                pass
+
         except Exception:
             pass
 
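For reference, the API branch above uses GitHub's documented GET /repos/{owner}/{repo}/readme endpoint, whose JSON response includes a download_url for the raw file. A standalone sketch without the app.py helpers (make_github_request and cached_request are internal to the app; the repo name is just an example):

```python
import requests

repo = "openai/CLIP"  # example owner/repo
resp = requests.get(
    f"https://api.github.com/repos/{repo}/readme",
    headers={"Accept": "application/vnd.github+json"},
    timeout=10,
)
readme = resp.json()
if readme.get("type") == "file" and readme.get("download_url"):
    # download_url points at the raw README, served from raw.githubusercontent.com
    raw_markdown = requests.get(readme["download_url"], timeout=10).text
```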
@@ -717,7 +742,7 @@ def infer_authors(input_data: str) -> List[str]:
     Examples:
         - "https://arxiv.org/abs/2103.00020"
         - "https://huggingface.co/papers/2103.00020"
-        - "CLIP
+        - "https://github.com/openai/CLIP"
 
     Returns:
         List[str]: A list of author names as strings, or empty list if no authors found.
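A hedged usage sketch of this entry point; the return shape follows the docstring, and the printed output is illustrative only:

```python
# Any of the documented input forms should be accepted:
authors = infer_authors("https://github.com/openai/CLIP")
print(authors)  # e.g. ["Alec Radford", ...], or [] if nothing could be inferred
```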
@@ -1192,10 +1217,10 @@ with gr.Blocks(title="Research Tracker MCP Server") as demo:
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
-                label="Demo Input
+                label="Demo Input",
                 placeholder="https://arxiv.org/abs/2506.18787",
                 lines=2,
-                info="
+                info="Paper URL, repository URL, or project page"
             )
             submit_btn = gr.Button("🔍 Demonstrate find_research_relationships", variant="primary")
 
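To round out the picture, a minimal sketch of how this Textbox is typically wired up inside the gr.Blocks demo; the output component and the click wiring below are hypothetical, not shown in this diff:

```python
import gradio as gr

with gr.Blocks(title="Research Tracker MCP Server") as demo:
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Demo Input",
                placeholder="https://arxiv.org/abs/2506.18787",
                lines=2,
                info="Paper URL, repository URL, or project page",
            )
            submit_btn = gr.Button("🔍 Demonstrate find_research_relationships", variant="primary")
    output = gr.JSON(label="Results")  # hypothetical output component
    # Hypothetical wiring; the actual handler isn't shown in this diff
    submit_btn.click(fn=find_research_relationships, inputs=input_text, outputs=output)
```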