| | import datetime |
| | import math |
| |
|
| | from datasets import load_dataset |
| | from sentence_transformers import SentenceTransformer |
| |
|
| | import gradio as gr |
| |
|
def boolean_search(paragraph, query):
    """Evaluate a flat, left-to-right boolean word query against a paragraph.

    The query is whitespace-tokenized and evaluated strictly left to right
    (no precedence, no parentheses, no exact phrases): operands are single
    words tested for membership in the paragraph's word set, separated by
    the operators ``and`` / ``or`` / ``not`` (case-insensitive).  A ``not``
    directly before an operand negates that operand, so both
    ``"x not y"`` (treated as x AND NOT y, matching the original code) and
    ``"x and not y"`` are supported.

    Args:
        paragraph: Text to search (typically an abstract).
        query: Boolean word query, e.g. ``"chatgpt AND NOT gpt3"``.

    Returns:
        bool: Result of the left-to-right evaluation; False for an empty
        query or when a trailing operator has no operand.
    """
    # Membership test only needs a set, not a dict of True values.
    words = set(paragraph.lower().split())
    tokens = query.lower().split()

    if not tokens:
        # Original code raised IndexError on an empty query.
        return False

    def next_operand(idx):
        """Consume any unary 'not's plus one word starting at idx.

        Returns (value, next_index); a missing operand evaluates to False.
        """
        negate = False
        while idx < len(tokens) and tokens[idx] == 'not':
            negate = not negate
            idx += 1
        if idx >= len(tokens):
            return False, idx
        present = tokens[idx] in words
        return (not present if negate else present), idx + 1

    result, i = next_operand(0)

    while i < len(tokens):
        operator = tokens[i]
        if operator == 'and':
            operand, i = next_operand(i + 1)
            result = result and operand
        elif operator == 'or':
            operand, i = next_operand(i + 1)
            result = result or operand
        elif operator == 'not':
            # Binary form "x not y" kept for compatibility: x AND NOT y.
            operand, i = next_operand(i + 1)
            result = result and not operand
        else:
            # Stray token where an operator was expected: skip it instead
            # of crashing (original raised IndexError here).
            i += 1

    return result
| |
|
def parse_retrieved(retrieved_examples, scores, filters, k,
                    result_keys=('title', 'authors', 'categories', 'abstract',
                                 'repo_url', 'is_official', 'mentioned_in_paper')):
    """Turn a FAISS retrieval batch into filtered per-paper result dicts.

    Args:
        retrieved_examples: Column-oriented mapping (column name -> list of
            values, one per retrieved row) as returned by
            ``Dataset.get_nearest_examples``.  Must include ``id`` and
            ``versions`` in addition to ``result_keys``.
        scores: Similarity scores, parallel to the rows of
            ``retrieved_examples``.
        filters: Dict with keys ``limit2_pwc`` (bool: require a repo link),
            ``sy``/``ey`` (int: inclusive start/end year), and
            ``boolean_terms`` (str: query for ``boolean_search``; ""
            disables the check).
        k: Maximum number of result dicts to return.
        result_keys: Columns copied verbatim into each result dict
            (defaults to the module's conventional key list, replacing the
            previous implicit dependence on a global ``keys``).

    Returns:
        list: ``[results, repo_avail, in_date, bool_met]`` where ``results``
        is at most ``k`` dicts that passed all filters, and the three ints
        count, over ALL retrieved rows, how many have a repo URL, fall in
        the date range, and satisfy the boolean terms respectively.
    """
    results = []
    n = len(scores)
    repo_avail = n
    in_date = n
    bool_met = n

    for i in range(n):
        resdict = {key: retrieved_examples[key][i] for key in result_keys}

        arxiv_id = retrieved_examples['id'][i]
        resdict['arxiv_url'] = "https://arxiv.org/abs/{}".format(arxiv_id)
        resdict['pdf_url'] = "https://arxiv.org/pdf/{}.pdf".format(arxiv_id)
        # BUG FIX: the original indexed versions[0][0] -- always the FIRST
        # retrieved paper's first version -- so every row got the same
        # publication date and year filtering was wrong.  Use row i.
        resdict['published'] = retrieved_examples['versions'][i][0]['created']
        # Date strings look like "Mon, 2 Apr 2007 19:18:42 GMT".
        resdict['year'] = datetime.datetime.strptime(
            resdict['published'], "%a, %d %b %Y %H:%M:%S %Z").year
        # Truncate to at most 5 chars for compact display, e.g. "0.123".
        resdict['score'] = str(round(scores[i], 3))[:5]

        relevant = True

        if resdict['repo_url'] is None:
            repo_avail -= 1
            resdict['repo_url'] = ""
            if filters['limit2_pwc']:
                relevant = False

        if filters['sy'] > resdict['year'] or filters['ey'] < resdict['year']:
            relevant = False
            in_date -= 1

        if filters['boolean_terms'] != "":
            if not boolean_search(resdict['abstract'], filters['boolean_terms']):
                relevant = False
                bool_met -= 1

        if relevant:
            results.append(resdict)

    return [results[:k], repo_avail, in_date, bool_met]
| |
|
def create_metadata_html(metadata_dict):
    """Render one search-result dict as an HTML card.

    Expects the keys produced by parse_retrieved: title, score, published,
    authors, categories, year, arxiv_url, pdf_url, abstract, repo_url.
    """
    card_template = '''
    <div style="border: 1px solid #ccc; padding: 10px; background-color: #f9f9f9;">
    <h2>{title}</h2>
    <pre><p><strong>Relevance_score:</strong> {score}    <strong>Published:</strong> {published}</p></pre>
    <p><strong>Authors:</strong> {authors}</p>
    <pre><p><strong>Categories:</strong> {categories}    <strong>Year:</strong> {year}</p></pre>
    <pre><p><a href="{arxiv_url}"><strong>ArXiv URL</strong></a>    <a href="{pdf_url}"><strong>PDF URL</strong></a></p></pre>
    <p><strong>Abstract:</strong> {abstract}</p>
    <p><strong>Repo URL:</strong> <a href="{repo_url}">{repo_url}</a><p>
    </div>
    '''
    return card_template.format(**metadata_dict)
| |
|
def search(query, boolean_terms, sy, ey, limit2_pwc):
    """Gradio handler: embed the query, retrieve, filter, and render HTML.

    Encodes `query` with the module-level sentence-transformer, pulls the
    100 nearest dataset rows from the FAISS index, applies the user's
    filters via parse_retrieved, and returns one HTML string containing a
    summary header followed by a card per result.
    """
    top_k = 30

    # Dense retrieval against the pre-built FAISS index.
    query_embedding = model.encode(query)
    scores, retrieved_examples = ds['train'].get_nearest_examples(
        'embeddings', query_embedding, k=100)

    filters = {
        'limit2_pwc': limit2_pwc,
        'sy': sy,
        'ey': ey,
        'boolean_terms': boolean_terms,
    }
    hits, repo_avail, in_date, bool_met = parse_retrieved(
        retrieved_examples, scores, filters, top_k)

    # NOTE(review): cards are reversed before display, so the page shows
    # results in ascending relevance order -- presumably intentional.
    cards = [create_metadata_html(hit) for hit in hits]
    cards.reverse()

    header = ("<br><br><pre><strong>Articles with Repo:</strong> {} "
              "<strong>Articles in date range:</strong> {} "
              "<strong>Articles meeting boolean terms:</strong> {}</pre>"
              "<br><strong>Top 30 results returned</strong><br>").format(
        str(repo_avail), str(in_date), str(bool_met))
    return header + "<br>".join(cards)
| |
|
| |
|
# Columns copied verbatim into each result dict by parse_retrieved.
# (The original prefixed this with `global keys`, which is a no-op at
# module scope -- removed.)
keys = ['title', 'authors', 'categories', 'abstract', 'repo_url',
        'is_official', 'mentioned_in_paper']

# Pre-embedded arXiv CS dataset (post-2013) with an `embeddings` column;
# build an in-memory FAISS index over it for nearest-neighbour search.
ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6")
ds['train'].add_faiss_index(column='embeddings')

# Query encoder -- must be the same model family used to produce the
# dataset's `embeddings` column (all-MiniLM-L6-v2, per the dataset name).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
| |
|
| |
|
search_interface = gr.Blocks()

with search_interface:
    # Input widgets; order must match search()'s parameter order.
    # (Removed the original dead statement `fn = search,`, which only
    # built an unused tuple.)
    inputs = [
        gr.Textbox(label="Query", value="", info="Search Query"),
        gr.Textbox(label="Boolean Terms", value="", info="Simple boolean conditions on words contained in the abstract (AND OR and NOT accepted for individual words, exact phrase isn't supported)"),
        gr.Slider(2013, 2023, step=1, value=2013, label="Start Year", info="Choose the earliest date for papers retrieved"),
        gr.Slider(2013, 2023, step=1, value=2023, label="End Year", info="Choose the latest date for papers retrieved"),
        gr.Checkbox(value=False, label="Limit results to those with a link to a github repo via pwc"),
    ]
    # gr.Button's display text is its `value`; it has no `label` kwarg.
    run = gr.Button("Search")
    # gr.outputs.HTML is the deprecated pre-Blocks API (removed in
    # Gradio 4); inside a Blocks context use gr.HTML.
    output = gr.HTML()
    # Wire the example row into the UI instead of leaving it as an
    # unused module-level list.
    gr.Examples(
        examples=[
            ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles", "chatgpt AND NOT gpt3", 2013, 2023, True],
        ],
        inputs=inputs,
    )
    run.click(fn=search, inputs=inputs, outputs=output, api_name="Arxiv Semantic Search")

search_interface.launch()
| |
|