shahidshaikh committed on
Commit
df4deee
·
verified ·
1 Parent(s): 3020bbd

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +64 -24
tools.py CHANGED
@@ -8,20 +8,16 @@ from sklearn.decomposition import PCA
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  from sklearn.cluster import AgglomerativeClustering
10
 
11
- # --- Persistent Storage Configuration ---
12
- # Hugging Face Spaces typically mounts persistent data to /data
13
- PERSISTENT_BASE = "/data" if os.path.exists("/data") else os.path.dirname(os.path.abspath(__file__))
14
 
15
- CSV_PATH = os.path.join(PERSISTENT_BASE, "papers.csv")
16
- OUT_DIR = os.path.join(PERSISTENT_BASE, "outputs")
17
- PDF_DIR = os.path.join(PERSISTENT_BASE, "pdfs")
18
- SCRATCH_DIR = os.path.join(PERSISTENT_BASE, "scratch")
 
19
 
20
- # Ensure all persistent directories exist
21
- for _d in [OUT_DIR, PDF_DIR, SCRATCH_DIR]:
22
- os.makedirs(_d, exist_ok=True)
23
-
24
- HEADERS = ["Sr No", "Title", "DOI", "Web Link", "Authors", "Date of Publication", "Journal", "Abstract", "No of Citations", "Keywords"]
25
 
26
  PAJAIS_TAXONOMY = {
27
  "AI & Machine Learning": ["deep learning","neural networks","NLP","computer vision","reinforcement learning","transformers"],
@@ -46,7 +42,7 @@ def _embed():
46
 
47
  @tool
48
  def search_academic_source(query: str, source: str) -> str:
49
- """Search academic sources. Valid sources: 'google_scholar', 'arxiv', 'tavily'."""
50
  try:
51
  src = source.lower()
52
  if "google" in src or "scholar" in src:
@@ -55,25 +51,52 @@ def search_academic_source(query: str, source: str) -> str:
55
  elif "arxiv" in src:
56
  from langchain_community.utilities import ArxivAPIWrapper
57
  return ArxivAPIWrapper(top_k_results=5).run(query)[:5000]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  else:
59
  from langchain_community.tools.tavily_search import TavilySearchResults
60
- # Tavily is an LLM-friendly in-built web searcher
61
- return str(TavilySearchResults(max_results=5).invoke(f"{query} academic paper abstract"))[:5000]
62
  except Exception as e: return f"Error searching {source}: {str(e)}"
63
 
64
- @tool
65
- def download_pdf(url: str, filename: str) -> str:
66
- """Download a PDF from the given URL and save it to the pdfs/ folder. Ensure filename ends with .pdf."""
67
  try:
68
  r = requests.get(url, stream=True, timeout=12)
69
  if r.status_code == 200:
70
  if not filename.lower().endswith(".pdf"): filename += ".pdf"
71
- path = os.path.join(PDF_DIR, filename)
72
  with open(path, 'wb') as f:
73
  for chunk in r.iter_content(chunk_size=8192): f.write(chunk)
74
- return f"Success: Downloaded to {path}"
75
- return f"Failed: HTTP {r.status_code}"
76
- except Exception as e: return f"Download error: {e}"
77
 
78
  @tool
79
  def save_papers(papers_json: str) -> str:
@@ -92,8 +115,15 @@ def save_papers(papers_json: str) -> str:
92
  impact_words = ["societ", "ethic", "bias", "fairness", "privacy", "impact", "policy", "govern", "regulation", "human"]
93
  return any(w in txt for w in ai_words) and any(w in txt for w in impact_words)
94
 
95
- new_rows = [p for p in incoming if p.get("Title","").strip().lower() not in seen and is_relevant(p)]
96
- discarded_count = len([p for p in incoming if p.get("Title","").strip().lower() not in seen and not is_relevant(p)])
 
 
 
 
 
 
 
97
 
98
  # Enforce MAX_TOTAL_PAPERS = 120 (Prioritize existing CSV papers)
99
  total_current = len(existing)
@@ -107,8 +137,18 @@ def save_papers(papers_json: str) -> str:
107
 
108
  if final_new_rows:
109
  pd.concat([existing, pd.DataFrame(final_new_rows)]).to_csv(CSV_PATH, index=False)
 
 
 
 
 
 
 
 
 
110
  enrich_doi.invoke({}) # Auto-fill DOIs for new papers
111
  msg = f"Saved {len(final_new_rows)} papers. Total: {len(existing)+len(final_new_rows)}."
 
112
  if discarded_count > 0: msg += f" (Discarded {discarded_count} papers for not aligning with AI & Societal Impact)."
113
  if len(new_rows) > allowed_new: msg += f" (Truncated {len(new_rows) - allowed_new} exceeding cap)."
114
  return msg
 
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  from sklearn.cluster import AgglomerativeClustering
10
 
11
+ # Use /data for persistence if running on HF Spaces with a mounted bucket, otherwise use local directory
12
+ ROOT_DIR = "/data" if os.path.exists("/data") else os.path.dirname(os.path.abspath(__file__))
 
13
 
14
+ CSV_PATH = os.path.join(ROOT_DIR, "papers.csv")
15
+ OUT_DIR = os.path.join(ROOT_DIR, "outputs")
16
+ PDF_DIR = os.path.join(ROOT_DIR, "pdfs")
17
+ SCRATCH_DIR = os.path.join(ROOT_DIR, "scratch")
18
+ for _d in [OUT_DIR, PDF_DIR, SCRATCH_DIR]: os.makedirs(_d, exist_ok=True)
19
 
20
+ HEADERS = ["Sr No", "Title", "DOI", "Web Link", "Authors", "Date of Publication", "Journal", "Abstract", "No of Citations"]
 
 
 
 
21
 
22
  PAJAIS_TAXONOMY = {
23
  "AI & Machine Learning": ["deep learning","neural networks","NLP","computer vision","reinforcement learning","transformers"],
 
42
 
43
  @tool
44
  def search_academic_source(query: str, source: str) -> str:
45
+ """Search academic sources. Valid sources: 'google_scholar', 'arxiv', 'pubmed', 'hf_papers', 'tavily', 'apify', 'semantic_scholar', 'openalex', 'scopus', 'web_of_science'."""
46
  try:
47
  src = source.lower()
48
  if "google" in src or "scholar" in src:
 
51
  elif "arxiv" in src:
52
  from langchain_community.utilities import ArxivAPIWrapper
53
  return ArxivAPIWrapper(top_k_results=5).run(query)[:5000]
54
+ elif "pubmed" in src:
55
+ from langchain_community.utilities.pubmed import PubMedAPIWrapper
56
+ return PubMedAPIWrapper(top_k_results=5).run(query)[:5000]
57
+ elif "hf_papers" in src or "huggingface" in src:
58
+ res = requests.get(f"https://huggingface.co/api/papers/search?q={urllib.parse.quote(query)}").json()
59
+ return str(res[:5])[:5000] if isinstance(res, list) else "HF Search Error"
60
+ elif "apify" in src:
61
+ from langchain_community.utilities import ApifyWrapper
62
+ apify = ApifyWrapper()
63
+ return apify.run("apify/arxiv-scraper", run_input={"searchSubj": "all", "searchQuery": query, "maxResults": 5})[:5000]
64
+ elif "semantic_scholar" in src:
65
+ from langchain_community.utilities.semanticscholar import SemanticScholarAPIWrapper
66
+ return SemanticScholarAPIWrapper(top_k_results=5).run(query)[:5000]
67
+ elif "openalex" in src:
68
+ api_key = os.getenv("OPENALEX_API_KEY")
69
+ url = f"https://api.openalex.org/works?search={urllib.parse.quote(query)}&per-page=5"
70
+ headers = {"User-Agent": "AcademicAgent/1.0 (mailto:agent@research-platform.org)"}
71
+ if api_key: headers["api_key"] = api_key # Higher rate limits if key provided
72
+ res = requests.get(url, headers=headers).json()
73
+ return str(res.get("results", []))[:5000]
74
+ elif "scopus" in src:
75
+ api_key = os.getenv("SCOPUS_API_KEY")
76
+ if not api_key: return "Error: SCOPUS_API_KEY missing. Request the user to provide it or switch to 'semantic_scholar' or 'openalex'."
77
+ res = requests.get(f"https://api.elsevier.com/content/search/scopus?query={urllib.parse.quote(query)}&count=5", headers={"X-ELS-APIKey": api_key, "Accept": "application/json"}).json()
78
+ return str(res)[:5000]
79
+ elif "web_of_science" in src or "wos" in src:
80
+ api_key = os.getenv("WOS_API_KEY")
81
+ if not api_key: return "Error: WOS_API_KEY missing. Request the user to provide it or switch to 'semantic_scholar' or 'openalex'."
82
+ res = requests.get(f"https://wos-api.clarivate.com/api/wos?databaseId=WOS&usrQuery={urllib.parse.quote(query)}&count=5&firstRecord=1", headers={"X-ApiKey": api_key}).json()
83
+ return str(res)[:5000]
84
  else:
85
  from langchain_community.tools.tavily_search import TavilySearchResults
86
+ return str(TavilySearchResults(max_results=5).invoke(f"{query} academic peer reviewed paper"))[:5000]
 
87
  except Exception as e: return f"Error searching {source}: {str(e)}"
88
 
89
def _download_pdf(url: str, filename: str) -> bool:
    """Download the PDF at ``url`` into ``PDF_DIR`` and report success.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Desired file name. ".pdf" is appended when missing, and
            every character other than alphanumerics, ".", "_", "-" and
            space is dropped, so path separators cannot escape ``PDF_DIR``.

    Returns:
        True when the server answered 200 and the body was fully written to
        disk; False for any other status, network error or file-system error.
    """
    # Reject anything that is not an HTTP(S) URL up front instead of relying
    # on requests to raise for a missing or unsupported scheme.
    if not str(url).lower().startswith(("http://", "https://")):
        return False

    name = str(filename) if filename else "paper"
    if not name.lower().endswith(".pdf"):
        name += ".pdf"
    safe_name = "".join(ch for ch in name if ch.isalnum() or ch in "._- ")
    # A title made only of stripped characters would otherwise become the
    # hidden file ".pdf"; fall back to a generic stem.
    if not safe_name[:-4].strip(" ."):
        safe_name = "paper.pdf"

    path = os.path.join(PDF_DIR, safe_name)
    # Stream into a temporary sibling so a failed transfer never leaves a
    # truncated PDF under the final name.
    tmp_path = path + ".part"
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=12) as r:
            if r.status_code != 200:
                return False
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, path)
        return True
    except (requests.RequestException, OSError):
        # Best-effort download: report failure to the caller, but do not
        # swallow KeyboardInterrupt/SystemExit as the bare except did.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing was written, or the partial file is already gone
        return False
100
 
101
  @tool
102
  def save_papers(papers_json: str) -> str:
 
115
  impact_words = ["societ", "ethic", "bias", "fairness", "privacy", "impact", "policy", "govern", "regulation", "human"]
116
  return any(w in txt for w in ai_words) and any(w in txt for w in impact_words)
117
 
118
+ # Strict Academic Filter: Discard Non-Papers
119
def is_research_paper(p):
    """Return True unless the record looks like a non-paper publication.

    The Title, Journal and Abstract fields of ``p`` are lower-cased and
    split on whitespace; the record is rejected when any resulting word,
    with surrounding punctuation removed, is one of the banned publication
    types. Matching is on whole words, so "facebook" does not trigger
    "book".

    NOTE(review): the abstract is scanned too, so a paper that merely
    mentions e.g. "news" is rejected - confirm this is intended.
    """
    import string  # local import: only this helper needs it

    banned_words = {"book", "chapter", "news", "editorial", "encyclopedia", "newspaper", "magazine"}
    txt = (str(p.get("Title", "")) + " " + str(p.get("Journal", "")) + " " + str(p.get("Abstract", ""))).lower()
    # Plain split() leaves punctuation attached ("book.", "(editorial)"),
    # which let such records slip past the filter; strip it per token.
    words = {tok.strip(string.punctuation) for tok in txt.split()}
    return banned_words.isdisjoint(words)
123
+
124
+ valid_items = [p for p in incoming if p.get("Title","").strip().lower() not in seen and is_relevant(p) and is_research_paper(p)]
125
+ discarded_count = len([p for p in incoming if p.get("Title","").strip().lower() not in seen and not (is_relevant(p) and is_research_paper(p))])
126
+ new_rows = valid_items
127
 
128
  # Enforce MAX_TOTAL_PAPERS = 120 (Prioritize existing CSV papers)
129
  total_current = len(existing)
 
137
 
138
  if final_new_rows:
139
  pd.concat([existing, pd.DataFrame(final_new_rows)]).to_csv(CSV_PATH, index=False)
140
+
141
+ # Auto-download PDFs for new rows
142
+ download_count = 0
143
+ for p in final_new_rows:
144
+ link = p.get("Web Link", "")
145
+ if link and str(link).lower().endswith(".pdf"):
146
+ if _download_pdf(link, p.get("Title", "paper")[:50]):
147
+ download_count += 1
148
+
149
  enrich_doi.invoke({}) # Auto-fill DOIs for new papers
150
  msg = f"Saved {len(final_new_rows)} papers. Total: {len(existing)+len(final_new_rows)}."
151
+ if download_count > 0: msg += f" (Archived {download_count} PDFs)."
152
  if discarded_count > 0: msg += f" (Discarded {discarded_count} papers for not aligning with AI & Societal Impact)."
153
  if len(new_rows) > allowed_new: msg += f" (Truncated {len(new_rows) - allowed_new} exceeding cap)."
154
  return msg