Spaces:
Sleeping
Sleeping
Update tools.py
Browse files
tools.py
CHANGED
|
@@ -8,20 +8,16 @@ from sklearn.decomposition import PCA
|
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
from sklearn.cluster import AgglomerativeClustering
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
|
| 13 |
-
PERSISTENT_BASE = "/data" if os.path.exists("/data") else os.path.dirname(os.path.abspath(__file__))
|
| 14 |
|
| 15 |
-
CSV_PATH = os.path.join(
|
| 16 |
-
OUT_DIR = os.path.join(
|
| 17 |
-
PDF_DIR = os.path.join(
|
| 18 |
-
SCRATCH_DIR = os.path.join(
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
for _d in [OUT_DIR, PDF_DIR, SCRATCH_DIR]:
|
| 22 |
-
os.makedirs(_d, exist_ok=True)
|
| 23 |
-
|
| 24 |
-
HEADERS = ["Sr No", "Title", "DOI", "Web Link", "Authors", "Date of Publication", "Journal", "Abstract", "No of Citations", "Keywords"]
|
| 25 |
|
| 26 |
PAJAIS_TAXONOMY = {
|
| 27 |
"AI & Machine Learning": ["deep learning","neural networks","NLP","computer vision","reinforcement learning","transformers"],
|
|
@@ -46,7 +42,7 @@ def _embed():
|
|
| 46 |
|
| 47 |
@tool
|
| 48 |
def search_academic_source(query: str, source: str) -> str:
|
| 49 |
-
"""Search academic sources. Valid sources: 'google_scholar', 'arxiv', 'tavily'."""
|
| 50 |
try:
|
| 51 |
src = source.lower()
|
| 52 |
if "google" in src or "scholar" in src:
|
|
@@ -55,25 +51,52 @@ def search_academic_source(query: str, source: str) -> str:
|
|
| 55 |
elif "arxiv" in src:
|
| 56 |
from langchain_community.utilities import ArxivAPIWrapper
|
| 57 |
return ArxivAPIWrapper(top_k_results=5).run(query)[:5000]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
else:
|
| 59 |
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 60 |
-
|
| 61 |
-
return str(TavilySearchResults(max_results=5).invoke(f"{query} academic paper abstract"))[:5000]
|
| 62 |
except Exception as e: return f"Error searching {source}: {str(e)}"
|
| 63 |
|
| 64 |
-
|
| 65 |
-
def download_pdf(url: str, filename: str) -> str:
|
| 66 |
-
"""Download a PDF from the given URL and save it to the pdfs/ folder. Ensure filename ends with .pdf."""
|
| 67 |
try:
|
| 68 |
r = requests.get(url, stream=True, timeout=12)
|
| 69 |
if r.status_code == 200:
|
| 70 |
if not filename.lower().endswith(".pdf"): filename += ".pdf"
|
| 71 |
-
path = os.path.join(PDF_DIR, filename)
|
| 72 |
with open(path, 'wb') as f:
|
| 73 |
for chunk in r.iter_content(chunk_size=8192): f.write(chunk)
|
| 74 |
-
return
|
| 75 |
-
|
| 76 |
-
|
| 77 |
|
| 78 |
@tool
|
| 79 |
def save_papers(papers_json: str) -> str:
|
|
@@ -92,8 +115,15 @@ def save_papers(papers_json: str) -> str:
|
|
| 92 |
impact_words = ["societ", "ethic", "bias", "fairness", "privacy", "impact", "policy", "govern", "regulation", "human"]
|
| 93 |
return any(w in txt for w in ai_words) and any(w in txt for w in impact_words)
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
# Enforce MAX_TOTAL_PAPERS = 120 (Prioritize existing CSV papers)
|
| 99 |
total_current = len(existing)
|
|
@@ -107,8 +137,18 @@ def save_papers(papers_json: str) -> str:
|
|
| 107 |
|
| 108 |
if final_new_rows:
|
| 109 |
pd.concat([existing, pd.DataFrame(final_new_rows)]).to_csv(CSV_PATH, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
enrich_doi.invoke({}) # Auto-fill DOIs for new papers
|
| 111 |
msg = f"Saved {len(final_new_rows)} papers. Total: {len(existing)+len(final_new_rows)}."
|
|
|
|
| 112 |
if discarded_count > 0: msg += f" (Discarded {discarded_count} papers for not aligning with AI & Societal Impact)."
|
| 113 |
if len(new_rows) > allowed_new: msg += f" (Truncated {len(new_rows) - allowed_new} exceeding cap)."
|
| 114 |
return msg
|
|
|
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
from sklearn.cluster import AgglomerativeClustering
|
| 10 |
|
| 11 |
+
# Storage root: the mounted /data bucket when it exists (persistent storage on
# HF Spaces), otherwise the directory that contains this module.
if os.path.exists("/data"):
    ROOT_DIR = "/data"
else:
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# The paper table lives directly under the root; the working folders sit next to it.
CSV_PATH = os.path.join(ROOT_DIR, "papers.csv")
OUT_DIR, PDF_DIR, SCRATCH_DIR = (
    os.path.join(ROOT_DIR, sub) for sub in ("outputs", "pdfs", "scratch")
)

# Create the working folders at import time so later writes never hit a missing directory.
for _d in (OUT_DIR, PDF_DIR, SCRATCH_DIR):
    os.makedirs(_d, exist_ok=True)

# Column order of the papers CSV.
HEADERS = [
    "Sr No",
    "Title",
    "DOI",
    "Web Link",
    "Authors",
    "Date of Publication",
    "Journal",
    "Abstract",
    "No of Citations",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
PAJAIS_TAXONOMY = {
|
| 23 |
"AI & Machine Learning": ["deep learning","neural networks","NLP","computer vision","reinforcement learning","transformers"],
|
|
|
|
| 42 |
|
| 43 |
@tool
|
| 44 |
def search_academic_source(query: str, source: str) -> str:
|
| 45 |
+
"""Search academic sources. Valid sources: 'google_scholar', 'arxiv', 'pubmed', 'hf_papers', 'tavily', 'apify', 'semantic_scholar', 'openalex', 'scopus', 'web_of_science'."""
|
| 46 |
try:
|
| 47 |
src = source.lower()
|
| 48 |
if "google" in src or "scholar" in src:
|
|
|
|
| 51 |
elif "arxiv" in src:
|
| 52 |
from langchain_community.utilities import ArxivAPIWrapper
|
| 53 |
return ArxivAPIWrapper(top_k_results=5).run(query)[:5000]
|
| 54 |
+
elif "pubmed" in src:
|
| 55 |
+
from langchain_community.utilities.pubmed import PubMedAPIWrapper
|
| 56 |
+
return PubMedAPIWrapper(top_k_results=5).run(query)[:5000]
|
| 57 |
+
elif "hf_papers" in src or "huggingface" in src:
|
| 58 |
+
res = requests.get(f"https://huggingface.co/api/papers/search?q={urllib.parse.quote(query)}").json()
|
| 59 |
+
return str(res[:5])[:5000] if isinstance(res, list) else "HF Search Error"
|
| 60 |
+
elif "apify" in src:
|
| 61 |
+
from langchain_community.utilities import ApifyWrapper
|
| 62 |
+
apify = ApifyWrapper()
|
| 63 |
+
return apify.run("apify/arxiv-scraper", run_input={"searchSubj": "all", "searchQuery": query, "maxResults": 5})[:5000]
|
| 64 |
+
elif "semantic_scholar" in src:
|
| 65 |
+
from langchain_community.utilities.semanticscholar import SemanticScholarAPIWrapper
|
| 66 |
+
return SemanticScholarAPIWrapper(top_k_results=5).run(query)[:5000]
|
| 67 |
+
elif "openalex" in src:
|
| 68 |
+
api_key = os.getenv("OPENALEX_API_KEY")
|
| 69 |
+
url = f"https://api.openalex.org/works?search={urllib.parse.quote(query)}&per-page=5"
|
| 70 |
+
headers = {"User-Agent": "AcademicAgent/1.0 (mailto:agent@research-platform.org)"}
|
| 71 |
+
if api_key: headers["api_key"] = api_key # Higher rate limits if key provided
|
| 72 |
+
res = requests.get(url, headers=headers).json()
|
| 73 |
+
return str(res.get("results", []))[:5000]
|
| 74 |
+
elif "scopus" in src:
|
| 75 |
+
api_key = os.getenv("SCOPUS_API_KEY")
|
| 76 |
+
if not api_key: return "Error: SCOPUS_API_KEY missing. Request the user to provide it or switch to 'semantic_scholar' or 'openalex'."
|
| 77 |
+
res = requests.get(f"https://api.elsevier.com/content/search/scopus?query={urllib.parse.quote(query)}&count=5", headers={"X-ELS-APIKey": api_key, "Accept": "application/json"}).json()
|
| 78 |
+
return str(res)[:5000]
|
| 79 |
+
elif "web_of_science" in src or "wos" in src:
|
| 80 |
+
api_key = os.getenv("WOS_API_KEY")
|
| 81 |
+
if not api_key: return "Error: WOS_API_KEY missing. Request the user to provide it or switch to 'semantic_scholar' or 'openalex'."
|
| 82 |
+
res = requests.get(f"https://wos-api.clarivate.com/api/wos?databaseId=WOS&usrQuery={urllib.parse.quote(query)}&count=5&firstRecord=1", headers={"X-ApiKey": api_key}).json()
|
| 83 |
+
return str(res)[:5000]
|
| 84 |
else:
|
| 85 |
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 86 |
+
return str(TavilySearchResults(max_results=5).invoke(f"{query} academic peer reviewed paper"))[:5000]
|
|
|
|
| 87 |
except Exception as e: return f"Error searching {source}: {str(e)}"
|
| 88 |
|
| 89 |
+
def _download_pdf(url: str, filename: str):
    """Best-effort download of ``url`` into ``PDF_DIR``.

    Args:
        url: Address to fetch; the body is streamed to disk in 8 KiB chunks.
        filename: Desired file name. ``.pdf`` is appended when missing, and
            every character other than alphanumerics and ``._- `` is dropped
            before the name is joined onto ``PDF_DIR``.

    Returns:
        True when the server answered 200 and the body was written,
        False on any other status or on any error.
    """
    try:
        # The context manager releases the streamed connection on every
        # path, including the non-200 early return and mid-stream failures.
        with requests.get(url, stream=True, timeout=12) as r:
            if r.status_code != 200:
                return False
            if not filename.lower().endswith(".pdf"):
                filename += ".pdf"
            # Stripping separators keeps the write inside PDF_DIR.
            safe_name = "".join(ch for ch in filename if ch.isalnum() or ch in "._- ")
            path = os.path.join(PDF_DIR, safe_name)
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return True
    except Exception:
        # Deliberately best-effort: a failed download must not abort the
        # caller. Unlike a bare ``except``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
|
| 100 |
|
| 101 |
@tool
|
| 102 |
def save_papers(papers_json: str) -> str:
|
|
|
|
| 115 |
impact_words = ["societ", "ethic", "bias", "fairness", "privacy", "impact", "policy", "govern", "regulation", "human"]
|
| 116 |
return any(w in txt for w in ai_words) and any(w in txt for w in impact_words)
|
| 117 |
|
| 118 |
+
# Strict Academic Filter: Discard Non-Papers
|
| 119 |
+
def is_research_paper(p):
    """Return True unless the record looks like a non-paper source.

    The Title, Journal and Abstract fields of ``p`` (a dict-like record;
    missing keys count as empty) are lower-cased and scanned for whole-word
    occurrences of terms such as "book", "news" or "editorial".

    Returns:
        False when any banned term appears as a word of its own, True otherwise.
    """
    import re  # local import: the file's import header is outside this view

    txt = " ".join(str(p.get(k, "")) for k in ("Title", "Journal", "Abstract")).lower()
    banned_words = {"book", "chapter", "news", "editorial", "encyclopedia", "newspaper", "magazine"}
    # Tokenise on letter runs rather than whitespace: a plain split() keeps
    # punctuation attached ("editorial:", "chapter)."), which let such items
    # slip through. Whole-token comparison still leaves "facebook" allowed.
    return banned_words.isdisjoint(re.findall(r"[a-z]+", txt))
|
| 123 |
+
|
| 124 |
+
valid_items = [p for p in incoming if p.get("Title","").strip().lower() not in seen and is_relevant(p) and is_research_paper(p)]
|
| 125 |
+
discarded_count = len([p for p in incoming if p.get("Title","").strip().lower() not in seen and not (is_relevant(p) and is_research_paper(p))])
|
| 126 |
+
new_rows = valid_items
|
| 127 |
|
| 128 |
# Enforce MAX_TOTAL_PAPERS = 120 (Prioritize existing CSV papers)
|
| 129 |
total_current = len(existing)
|
|
|
|
| 137 |
|
| 138 |
if final_new_rows:
|
| 139 |
pd.concat([existing, pd.DataFrame(final_new_rows)]).to_csv(CSV_PATH, index=False)
|
| 140 |
+
|
| 141 |
+
# Auto-download PDFs for new rows
|
| 142 |
+
download_count = 0
|
| 143 |
+
for p in final_new_rows:
|
| 144 |
+
link = p.get("Web Link", "")
|
| 145 |
+
if link and str(link).lower().endswith(".pdf"):
|
| 146 |
+
if _download_pdf(link, p.get("Title", "paper")[:50]):
|
| 147 |
+
download_count += 1
|
| 148 |
+
|
| 149 |
enrich_doi.invoke({}) # Auto-fill DOIs for new papers
|
| 150 |
msg = f"Saved {len(final_new_rows)} papers. Total: {len(existing)+len(final_new_rows)}."
|
| 151 |
+
if download_count > 0: msg += f" (Archived {download_count} PDFs)."
|
| 152 |
if discarded_count > 0: msg += f" (Discarded {discarded_count} papers for not aligning with AI & Societal Impact)."
|
| 153 |
if len(new_rows) > allowed_new: msg += f" (Truncated {len(new_rows) - allowed_new} exceeding cap)."
|
| 154 |
return msg
|