|
|
| import requests
|
| import os
|
| from langchain.document_loaders import PDFMinerLoader
|
| from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| from langchain.vectorstores import FAISS
|
| from langchain_community.document_loaders import ArxivLoader
|
| from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| import faiss
|
| from langchain_community.docstore.in_memory import InMemoryDocstore
|
| from config.config import model
|
| import urllib.request as libreq
|
| import xml.etree.ElementTree as ET
|
|
|
# Downloaded PDFs are written under ./papers; create the directory up front
# so later file writes do not fail on a missing folder.
os.makedirs("papers", exist_ok=True)

# Splitter used to cut loaded documents into 700-character chunks with no
# overlap, measuring length with the built-in ``len``.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=0,
    length_function=len,
)

# Embedding model shared by every vector store built in this module.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Embed a throwaway query once: the length of the resulting vector is used
# as the dimensionality of the flat L2 index.
_probe_vector = embeddings.embed_query("dummy query")
index = faiss.IndexFlatL2(len(_probe_vector))
|
|
|
class SearchAgent:
    """Search arXiv for papers matching a user task and index them in FAISS.

    ``solve`` asks the language model to turn the task into a search topic,
    queries the arXiv export API, downloads the PDFs of the returned entries
    into ``papers/``, splits their text into chunks and saves a FAISS vector
    store under ``vector_db``.
    """

    # Endpoint of the arXiv export API used for searching.
    _ARXIV_API_URL = "http://export.arxiv.org/api/query"
    # XML namespace of the Atom feed returned by the arXiv API.
    _ATOM_NS = {'atom': 'http://www.w3.org/2005/Atom'}
    # Seconds to wait for a PDF download before giving up.
    _DOWNLOAD_TIMEOUT = 30

    def __init__(self):
        """Store the shared model and the topic-extraction prompt."""
        self.model = model
        self.p = """You are an assistant designed to extract research topics or titles from user queries. When a user asks about a specific topic, identify the central subject of their query and provide a concise, clear title or topic related to that area of research. If the query refers to a particular research paper, include the paper's title, author(s), and publication year.

Here are the instructions you should follow:

General Topics: If the query mentions a general topic without referring to a specific paper, identify the primary research area or topic. For example, if the query is "What are the advancements in text-to-SQL models?" your response should be simply "Text-to-SQL Models."

Specific Research Papers: If the query mentions a particular paper, extract the title, author(s), and year of the paper. For example, if the query is "What did the paper by John Doe in 2022 say about AI in healthcare?" your response should be "AI in Healthcare (John Doe, 2022)."

Abstract or General Query: If the query is an abstract or general inquiry into a topic, return the main theme or title of that topic. For instance, "What are the advancements in natural language processing?" would result in "Natural Language Processing Advancements."

Examples:

User Query: "Tell me about recent advancements in text-to-SQL models." Response: "Text-to-SQL Models."

User Query: "What does the paper 'Deep Learning for Text-to-SQL by Jane Smith, 2021' cover?" Response: "'Deep Learning for Text-to-SQL' (Jane Smith, 2021)."

User Query: "Can you summarize the paper by Alice Brown on quantum computing from 2020?" Response: "'Quantum Computing: A New Frontier' (Alice Brown, 2020)." """

    @staticmethod
    def _build_search_url(query, max_results=5):
        """Return the arXiv API URL that searches all fields for *query*."""
        from urllib.parse import quote

        # Percent-encode the whole query: the model's answer may contain
        # quotes, '&', '#' or newlines, which would otherwise break or
        # truncate the request URL. Spaces still become %20.
        encoded = quote(query, safe='')
        return (
            f"{SearchAgent._ARXIV_API_URL}?search_query=all:{encoded}"
            f"&sortBy=relevance&sortOrder=descending"
            f"&start=0&max_results={max_results}"
        )

    @staticmethod
    def _parse_feed(xml_content):
        """Extract title, PDF link and publication year from an Atom feed.

        Returns a list of dicts with the keys ``title``, ``link`` and
        ``year``. Entries without an ``<id>`` element are skipped because no
        PDF link can be derived for them.
        """
        ns = SearchAgent._ATOM_NS
        root = ET.fromstring(xml_content)
        papers = []
        for entry in root.findall('atom:entry', ns):
            paper_id = entry.findtext('atom:id', default='', namespaces=ns)
            if not paper_id:
                continue
            published = entry.findtext(
                'atom:published', default='', namespaces=ns
            )
            papers.append({
                'title': entry.findtext(
                    'atom:title', default='', namespaces=ns
                ),
                # Rewrite only the "/abs/" path segment, and only once, so
                # an identifier that happens to contain "abs" is untouched.
                'link': paper_id.replace("/abs/", "/pdf/", 1),
                'year': published[:4],
            })
        return papers

    @staticmethod
    def _paper_number_from_url(url):
        """Return the last path component of *url* without a ``.pdf`` suffix."""
        name = os.path.basename(url)
        # str.strip(".pdf") would remove any of the characters '.', 'p',
        # 'd', 'f' from both ends; cut the literal suffix instead.
        if name.endswith(".pdf"):
            name = name[:-len(".pdf")]
        return name

    @staticmethod
    def _download_pdf(url, paper_number):
        """Download *url* to ``papers/<paper_number>.pdf`` and return the path.

        Raises ``requests.RequestException`` on network failure, timeout or
        a non-success HTTP status, so an error page is never saved as a PDF.
        """
        res = requests.get(url, timeout=SearchAgent._DOWNLOAD_TIMEOUT)
        res.raise_for_status()
        pdf_path = f"papers/{paper_number}.pdf"
        with open(pdf_path, 'wb') as f:
            f.write(res.content)
        return pdf_path

    def solve(self, task):
        """Find, download and index arXiv papers relevant to *task*.

        Args:
            task: The user's natural-language request.

        Returns:
            A tuple ``(messages, papers)`` where ``messages`` is a
            one-element list with a summary string and ``papers`` is a list
            of dicts with the keys ``title``, ``link``, ``year`` and
            ``paper_number``.
        """
        print(f"Searching for information on: {task}")
        response = self.model.generate_content(self.p + task)
        query = response.text.strip()

        with libreq.urlopen(self._build_search_url(query)) as feed:
            papers = self._parse_feed(feed.read())

        all_papers = []
        for paper in papers:
            paper_number = self._paper_number_from_url(paper['link'])
            paper['paper_number'] = paper_number
            try:
                self._download_pdf(paper['link'], paper_number)
            except (requests.RequestException, OSError) as err:
                # Keep the paper in the returned metadata but leave it out
                # of the index: there is no usable PDF to load.
                print(f"Skipping {paper['link']}: download failed ({err})")
                continue
            all_papers.append(paper_number)

        # Build on a fresh index for every call. Reusing the module-level
        # ``index`` would keep vectors from earlier calls while the id map
        # below starts empty, so positions and document ids would disagree.
        vector_db = FAISS(
            embedding_function=embeddings,
            index=faiss.IndexFlatL2(index.d),
            docstore=InMemoryDocstore(),
            index_to_docstore_id={}
        )

        for pdf_number in all_papers:
            docs = PDFMinerLoader(f"papers/{pdf_number}.pdf").load()
            docs = text_splitter.split_documents(docs)
            vector_db.add_documents(docs)

        vector_db.save_local("vector_db", index_name="base_and_adjacent")

        return [f"Here are the papers on {query}"], papers
|
|
|
|
|