Spaces:
Runtime error
Runtime error
| # Each `paper` is a dictionary containing: | |
| # (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal (8) embeddings | |
| # | |
| # Generate references: | |
| # `References` class: | |
| # 1. Read a given .bib file to collect papers; use the `search_paper_abstract` function to fill in missing abstracts. | |
| # 2. Given some keywords, use the Semantic Scholar API to find papers. | |
| # 3. Generate bibtex from the selected papers. --> to_bibtex() | |
| # 4. Generate prompts from the selected papers: --> to_prompts() | |
| # A sample prompt: {"paper_id": "paper summary"} | |
| # todo: (1) citations & citedby of provided papers: | |
| # load the pre-defined papers; use S2 to find all related works | |
| # add all citations to `bib_papers` | |
| # add all citedby to `bib_papers` | |
| # use Semantic Scholar to find their embeddings | |
| # (2) separate references: | |
| # divide references into different groups to reduce the tokens count | |
| # for generating different paragraph of related works, use different set of references | |
| import requests | |
| import re | |
| import bibtexparser | |
| from scholarly import scholarly | |
| from scholarly import ProxyGenerator | |
| ###################################################################################################################### | |
| # Some basic tools | |
| ###################################################################################################################### | |
def remove_newlines(serie):
    """Flatten a piece of text onto a single line.

    Applied to the abstract of each paper to reduce the length of prompts.

    Args:
        serie: The text to clean up.

    Returns:
        ``serie`` with real newlines and literal backslash-n sequences turned
        into spaces, and every run of consecutive spaces collapsed to one.
    """
    serie = serie.replace('\n', ' ')
    # Some abstracts carry an escaped newline (a backslash followed by "n").
    serie = serie.replace('\\n', ' ')
    # A fixed number of replace passes leaves long runs only partly collapsed;
    # the regex squeezes a run of any length in a single pass.
    return re.sub(r' {2,}', ' ', serie)
def search_paper_abstract(title):
    """Search for a paper by title and return the abstract of the first hit.

    The search is issued through ``scholarly.search_pubs`` after routing
    ``scholarly`` through a ScraperAPI proxy.

    Args:
        title: Title of the paper to look up.

    Returns:
        The first hit's abstract, cleaned with ``remove_newlines``. An empty
        string is returned when the search yields no hit or the hit carries
        no abstract, so one unresolvable title does not abort a whole load.

    Raises:
        RuntimeError: If the ScraperAPI proxy cannot be set up.
    """
    import os

    # NOTE(review): this credential is committed to the source; it should be
    # rotated and supplied via configuration. The SCRAPER_API_KEY environment
    # variable takes precedence so deployments can already avoid the literal.
    api_key = os.environ.get("SCRAPER_API_KEY", "921b16f94d701308b9d9b4456ddde155")

    pg = ProxyGenerator()
    if not pg.ScraperAPI(api_key):
        raise RuntimeError("ScraperAPI fails.")
    scholarly.use_proxy(pg)

    # A default for `next` avoids a bare StopIteration when nothing matches.
    found_paper = next(scholarly.search_pubs(title), None)
    if found_paper is None:
        return ""
    abstract = (found_paper.get('bib') or {}).get('abstract')
    if abstract is None:
        return ""
    return remove_newlines(abstract)
def load_papers_from_bibtex(bib_file_path):
    """Load the papers described by a .bib file.

    Entries without a ``title`` field are skipped. When an entry has no
    ``abstract`` field, ``search_paper_abstract`` is called with its title to
    fill one in.

    Args:
        bib_file_path: Path of the BibTeX file to read.

    Returns:
        A list of paper dicts with the keys ``paper_id``, ``title``, ``link``
        (always an empty string), ``abstract``, ``authors``, ``year`` and
        ``journal``. No ``embeddings`` key is set for these papers. The list
        is empty when the file holds no entries.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform
    # default encoding; `References.to_bibtex` writes its files as UTF-8 too.
    with open(bib_file_path, encoding="utf-8") as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)

    bib_papers = []
    for bibitem in bib_database.entries:
        title = bibitem.get("title")
        if title is None:
            continue
        abstract = bibitem.get("abstract")
        if abstract is None:
            abstract = search_paper_abstract(title)
        bib_papers.append({
            "paper_id": bibitem.get("ID"),
            "title": title,
            "link": "",
            "abstract": abstract,
            "authors": bibitem.get("author"),
            "year": bibitem.get("year"),
            "journal": bibitem.get("journal"),
        })
    return bib_papers
| ###################################################################################################################### | |
| # Semantic Scholar (SS) API | |
| ###################################################################################################################### | |
def ss_search(keywords, limit=20, fields=None):
    """Query the Semantic Scholar paper-search endpoint.

    Args:
        keywords: Free-text search query.
        limit: Value sent as the ``limit`` query parameter.
        fields: List of field names sent as the comma-joined ``fields``
            parameter. Defaults to title, abstract, venue, year, authors,
            tldr, embedding and externalIds.

    Returns:
        The decoded JSON body of the response. Callers in this file read its
        ``"data"`` and ``"total"`` entries.
    """
    from urllib.parse import quote_plus

    if fields is None:
        fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]
    # quote_plus turns spaces into "+" (as before) and additionally escapes
    # characters such as "&", "#" or "+" that would otherwise corrupt the
    # query string.
    query = quote_plus(keywords.lower())
    url = (
        "https://api.semanticscholar.org/graph/v1/paper/search"
        f"?query={query}&limit={limit}&fields={','.join(fields)}"
    )
    # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
    headers = {"Accept": "*/*"}
    response = requests.get(url, headers=headers, timeout=30)
    return response.json()
def _collect_papers_ss(keyword, counts=3, tldr=False):
    """Search Semantic Scholar for ``keyword`` and normalize the results.

    Args:
        keyword: Query string handed to ``ss_search``.
        counts: Number of results requested (passed as ``limit``).
        tldr: When true, a paper's TL;DR text is used in place of its
            abstract whenever the result carries one.

    Returns:
        A list of paper dicts with the keys ``paper_id``, ``title``,
        ``abstract``, ``link``, ``authors``, ``year``, ``journal`` and
        ``embeddings``. Results without an abstract or without an embedding
        are dropped, so the list may hold fewer than ``counts`` items.
    """
    def externalIds2link(externalIds):
        # Sample externalIds:
        # "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
        # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
        # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
        if not externalIds:
            # None or an empty dictionary: there is nothing to link to.
            return ""
        dblp_id = externalIds.get('DBLP')
        if dblp_id is not None:
            return f"dblp.org/rec/{dblp_id}"
        arxiv_id = externalIds.get('ArXiv')
        if arxiv_id is not None:
            return f"arxiv.org/abs/{arxiv_id}"
        return ""

    def extract_paper_id(last_name, year_str, title):
        # The id is <last name><year><first word of the title>; when the title
        # does not start with a word character, its first four characters are
        # used instead.
        words = re.findall(r'^\w+', title)
        suffix = words[0] if words else title[:4]
        return last_name + year_str + suffix

    def extract_author_info(raw_authors):
        authors = [author['name'] for author in raw_authors]
        authors_str = " and ".join(authors)
        try:
            last_name = authors[0].split()[-1].replace("'", "")
        except IndexError:
            # No author, or an empty author name: fall back to a placeholder.
            last_name = "ma"
        return authors_str, last_name

    def parse_search_results(search_results_ss):
        # Turn the raw search results into a list of paper dictionaries.
        papers_ss = []
        for raw_paper in search_results_ss:
            if raw_paper["abstract"] is None:
                continue
            # Some papers have no embeddings; they are dropped, so skip them
            # before doing any further parsing.
            embeddings_dict = raw_paper.get('embedding')
            if embeddings_dict is None:
                continue
            embeddings = embeddings_dict['vector']

            authors_str, last_name = extract_author_info(raw_paper['authors'])
            # A missing year must not leak the string "None" into ids/BibTeX.
            year = raw_paper.get('year')
            year_str = "" if year is None else str(year)
            title = raw_paper['title']
            # The venue may be null; treat that like an empty venue.
            # Some journals contain "&"; escape it for BibTeX,
            # e.g. journal={IEEE Power & Energy Society General Meeting}
            journal = (raw_paper.get('venue') or "").replace("&", "\\&")
            if not journal:
                journal = "arXiv preprint"
            paper_id = extract_paper_id(last_name, year_str, title).lower()
            link = externalIds2link(raw_paper['externalIds'])
            if tldr and raw_paper['tldr'] is not None:
                abstract = raw_paper['tldr']['text']
            else:
                abstract = remove_newlines(raw_paper['abstract'])

            papers_ss.append({
                "paper_id": paper_id,
                "title": title,
                "abstract": abstract,
                "link": link,
                "authors": authors_str,
                "year": year_str,
                "journal": journal,
                "embeddings": embeddings,
            })
        return papers_ss

    raw_results = ss_search(keyword, limit=counts)
    # Both a missing response and a response without "data" mean "no results".
    search_results = (raw_results or {}).get("data") or []
    return parse_search_results(search_results)
| ###################################################################################################################### | |
| # References Class | |
| ###################################################################################################################### | |
class References:
    """Papers grouped by keyword, with BibTeX / prompt / JSON exports.

    ``self.papers`` maps a keyword to a list of paper dicts. Papers loaded
    from a .bib file at construction time are stored under the key
    ``"customized_refs"``.
    """

    def __init__(self, title=None, load_papers=None):
        """Create the collection.

        Args:
            title: Stored as ``self.title``.
            load_papers: Optional path of a .bib file; when given, its papers
                are loaded with ``load_papers_from_bibtex``.
        """
        self.papers = {}
        if load_papers is not None:
            self.papers["customized_refs"] = load_papers_from_bibtex(load_papers)
        self.title = title

    def load_papers(self, bibtex, keyword):
        """Load the .bib file ``bibtex`` and store its papers under ``keyword``."""
        self.papers[keyword] = load_papers_from_bibtex(bibtex)

    def generate_keywords_dict(self):
        """Return ``{keyword: number of papers stored for that keyword}``."""
        return {keyword: len(papers) for keyword, papers in self.papers.items()}

    def collect_papers(self, keywords_dict, tldr=False):
        """
        keywords_dict:
            {"machine learning": 5, "language model": 2};
            the first is the keyword, the second is how many references are needed.
        tldr:
            forwarded to `_collect_papers_ss`.
        """
        for key, counts in keywords_dict.items():
            self.papers[key] = _collect_papers_ss(key, counts, tldr)

    def to_bibtex(self, path_to_bibtex="ref.bib", max_num_refs=50):
        """
        Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.

        Papers sharing a `paper_id` are written once (first occurrence wins).
        `max_num_refs` is accepted but not enforced yet (see the todo below).
        """
        # todo:
        #   use embeddings to evaluate; keep top k relevant references in papers
        #   send (title, .bib file) to evaluate embeddings; receive truncated papers
        papers = self._get_papers(keyword="_all")

        bibtex_entries = []
        paper_ids = []
        seen = set()
        for paper in papers:
            paper_id = paper["paper_id"]
            if paper_id in seen:
                continue
            seen.add(paper_id)
            bibtex_entries.append(
                f"@article{{{paper_id},\n"
                f"title = {{{paper['title']}}},\n"
                f"author = {{{paper['authors']}}},\n"
                f"journal={{{paper['journal']}}},\n"
                f"year = {{{paper['year']}}},\n"
                f"url = {{{paper['link']}}}\n"
                "}"
            )
            paper_ids.append(paper_id)

        # Opening in "w" mode both clears the file and writes every entry in
        # one go, instead of reopening the file once per entry.
        with open(path_to_bibtex, "w", encoding="utf-8") as file:
            file.writelines(f"{entry}\n\n" for entry in bibtex_entries)
        return paper_ids

    def _get_papers(self, keyword="_all"):
        """Return the papers of one keyword, or of every keyword for "_all".

        Raises:
            KeyError: If ``keyword`` is neither "_all" nor a stored keyword.
        """
        if keyword == "_all":
            return [paper for group in self.papers.values() for paper in group]
        # Index with the argument itself, not the literal string "keyword".
        return self.papers[keyword]

    def to_prompts(self, keyword="_all"):
        # `prompts`:
        #   {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
        #   this will be used to instruct GPT model to cite the correct bibtex entry.
        return {paper["paper_id"]: paper["abstract"] for paper in self._get_papers(keyword)}

    def to_json(self, keyword="_all"):
        """Return ``{paper_id: paper dict}`` for the selected papers."""
        return {paper["paper_id"]: paper for paper in self._get_papers(keyword)}
if __name__ == "__main__":
    # Manual smoke test: performs a live Semantic Scholar request and reads
    # "test.bib" from the working directory.

    # testing search results
    r = ss_search("Deep Q-Networks", limit=1)  # raw JSON response
    # `.get` keeps the script going when the response carries no "total".
    if r.get('total', 0) > 0:
        paper = r['data'][0]
        # print(paper)

    # testing References
    # The constructor takes a title and a .bib path (None loads nothing).
    refs = References(title="Deep Q-Networks", load_papers=None)
    # keywords_dict = {
    #     "Deep Q-Networks": 5,
    #     "Actor-Critic Algorithms": 4,
    #     "Exploration-Exploitation Trade-off": 3
    # }
    # refs.collect_papers(keywords_dict, tldr=True)
    # for k in refs.papers:
    #     papers = refs.papers[k]  # for each keyword, there is a list of papers
    #     print("keyword: ", k)
    #     for paper in papers:
    #         print(paper["paper_id"])
    #
    # refs.to_bibtex()
    # papers_json = refs.to_json()  # this json can be used to find the most relevant papers
    # with open("papers.json", "w", encoding='utf-8') as text_file:
    #     text_file.write(f"{papers_json}")
    #
    # prompts = refs.to_prompts()
    # print(prompts)

    bib = "test.bib"
    refs.load_papers(bib, "variance-reduction rl")
    print(refs.papers)

    prompts = refs.to_prompts()
    for k, abstract in prompts.items():
        print(f"{k}: {abstract}\n")
    # for paper in papers:
    #     print(paper)