File size: 7,056 Bytes
9cbe032
 
 
 
8be0ea8
2a0549d
9cbe032
 
2a0549d
9cbe032
4ff251a
8be0ea8
 
 
4ff251a
8be0ea8
9cbe032
a532e37
56decc1
9cbe032
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a532e37
 
9cbe032
a532e37
9cbe032
a532e37
8be0ea8
a532e37
2a0549d
9cbe032
 
 
 
 
2a0549d
9cbe032
2a0549d
f0d3013
a532e37
9cbe032
a532e37
8be0ea8
2a0549d
9cbe032
 
2a0549d
f0d3013
2a0549d
f0d3013
a532e37
 
8be0ea8
a532e37
8be0ea8
9cbe032
 
 
8be0ea8
a532e37
 
8be0ea8
a532e37
 
8be0ea8
9cbe032
2a0549d
 
 
 
 
8be0ea8
9cbe032
2b86972
8be0ea8
 
 
f786bd8
 
 
2a0549d
 
 
 
 
f786bd8
8be0ea8
a141365
f0d3013
 
f786bd8
a141365
f0d3013
a532e37
253753f
43c7a1a
3bbb339
fece03e
8be0ea8
 
f0d3013
 
 
2a0549d
 
 
f0d3013
 
a532e37
 
8be0ea8
a532e37
8be0ea8
a532e37
 
 
 
 
 
 
2a0549d
 
 
 
 
a532e37
 
 
 
 
 
 
 
 
 
 
9cbe032
a532e37
 
 
 
2a0549d
 
 
 
a532e37
 
 
 
 
 
 
 
 
 
 
1145323
8be0ea8
f0d3013
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import asyncio
import os
from dataclasses import dataclass
from typing import List

import chainlit as cl
import feedparser
import requests
from dotenv import load_dotenv

# LangChain bits (unchanged)
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Runs once at import time, before any of the os.getenv lookups further down.
# NOTE(review): presumably loads variables from a local .env file into the
# process environment (python-dotenv) — confirm against deployment setup.
load_dotenv()

# Endpoint that fetch_arxiv_papers sends its GET query to.
ARXIV_API = "https://export.arxiv.org/api/query"

# ---------- Simple paper container (drop-in replacement for arxiv.Result we used) ----------
@dataclass
class Paper:
    """Plain record holding the feed fields of one arXiv entry used by this app."""

    # Entry title as taken from the feed.
    title: str
    # Entry summary text; this is the main body that gets indexed for Q&A.
    summary: str
    # Value of the entry's ``arxiv_comment`` field, or "" when the entry has none.
    comment: str
    # Entry ``id`` (falling back to its ``link``); shown to the user as the paper link.
    entry_id: str
    # Author display names; the first one is shown in the search result list.
    authors: List[str]

# ---------- Direct arXiv API fetch (HTTPS + custom UA) ----------
def fetch_arxiv_papers(query: str, max_results: int = 5) -> List[Paper]:
    """Query the arXiv Atom API and return the matching entries as ``Paper`` objects.

    Args:
        query: Search expression sent as the ``search_query`` parameter.
        max_results: Upper bound on the number of entries requested.

    Returns:
        One ``Paper`` per feed entry, in feed order (empty list if the feed
        has no entries).

    Raises:
        requests.HTTPError: If arXiv answers with an error status code.
        requests.RequestException: On connection failures or when the
            20-second timeout expires.
    """
    params = {
        "search_query": query,
        "id_list": "",
        "sortBy": "relevance",
        "sortOrder": "descending",
        "start": 0,
        "max_results": max_results,
    }
    headers = {
        # Identify the client (with a contact address) instead of using the default UA.
        "User-Agent": f"arxiv-chainlit-app/1.0 (mailto:{os.getenv('CONTACT_EMAIL','noreply@example.com')})",
        "Accept": "application/atom+xml",
    }

    resp = requests.get(ARXIV_API, params=params, headers=headers, timeout=20)
    # Raise on non-2xx so the caller can show a friendly error.
    resp.raise_for_status()

    feed = feedparser.parse(resp.text)
    papers: List[Paper] = []
    for entry in feed.entries:
        # arXiv hard-wraps long titles, so the raw value can contain newlines
        # and runs of spaces; collapse them so the title renders on one line.
        title = " ".join(getattr(entry, "title", "").split())
        summary = getattr(entry, "summary", "").strip()
        comment = getattr(entry, "arxiv_comment", "")
        entry_id = getattr(entry, "id", getattr(entry, "link", ""))
        # Drop blank names so an entry with only empty author records still
        # falls back to "Unknown" instead of showing an empty author.
        stripped_names = (a.get("name", "").strip() for a in getattr(entry, "authors", []))
        authors = [name for name in stripped_names if name]
        papers.append(
            Paper(
                title=title,
                summary=summary,
                comment=comment,
                entry_id=entry_id,
                authors=authors or ["Unknown"],
            )
        )
    return papers

# ---------- Your assistant, unchanged logic but using the new fetcher ----------
class ArxivResearchAssistant:
    """Three-step chat workflow: search arXiv, pick a paper, ask questions about it.

    ``state`` is one of ``"SEARCH"``, ``"SELECT"`` or ``"QA"`` and tells the
    message handler which method should receive the next user message.
    All blocking work (HTTP fetch, embedding/index build, chain call) is
    pushed to a worker thread so the async event loop stays responsive.
    """

    def __init__(self):
        self.selected_paper: Paper | None = None
        self.qa_chain = None
        self.papers: List[Paper] = []
        self.state = "SEARCH"

    async def search_papers(self, query: str):
        """Search arXiv for ``query`` and present the hits as a numbered list.

        On success the hits are stored in ``self.papers``, the state moves to
        ``"SELECT"`` and the list of papers is returned. On any error, or when
        nothing is found, a message is sent to the user and ``None`` is
        returned with the state left unchanged.
        """
        try:
            # fetch_arxiv_papers uses the synchronous `requests` client; run it
            # off the event loop so other sessions are not stalled meanwhile.
            self.papers = await asyncio.to_thread(fetch_arxiv_papers, query, max_results=5)
        except requests.HTTPError as e:
            # Shows the real HTTP status & message (e.g., if UA missing or rate-limited)
            await cl.Message(content=f"Error talking to arXiv (HTTP {e.response.status_code}): {e.response.text[:200]}").send()
            return None
        except Exception as e:
            await cl.Message(content=f"Error talking to arXiv: {e}").send()
            return None

        if not self.papers:
            await cl.Message(content="No papers found. Please try another search query.").send()
            return None

        paper_list = "\n".join(
            f"{i}. {p.title} - {p.authors[0]}\nLink: {p.entry_id}"
            for i, p in enumerate(self.papers, start=1)
        )
        await cl.Message(
            content=f"Please select a paper by entering its number:\n\n{paper_list}\n\nEnter the number of the paper you want to select:"
        ).send()
        self.state = "SELECT"
        return self.papers

    async def select_paper(self, selection: str):
        """Select a paper by its 1-based number and prepare it for Q&A.

        Returns the selected ``Paper`` and moves the state to ``"QA"`` on
        success. For non-numeric input, an out-of-range number, or a failure
        while building the retrieval chain, a message is sent to the user,
        ``None`` is returned and no state is changed.
        """
        try:
            idx = int(selection) - 1
        except ValueError:
            await cl.Message(content="Invalid input. Please enter a number.").send()
            return None

        if not 0 <= idx < len(self.papers):
            await cl.Message(content="Invalid selection. Please try again.").send()
            return None

        paper = self.papers[idx]
        try:
            # Embedding and indexing perform blocking network/CPU work.
            qa_chain = await asyncio.to_thread(self._build_qa_chain, paper)
        except Exception as e:
            # Same friendly-error style as search_papers; nothing has been
            # committed yet, so the user can simply pick again.
            await cl.Message(content=f"Error preparing the paper for Q&A: {e}").send()
            return None

        # Commit the selection only once the chain is ready.
        self.selected_paper = paper
        self.qa_chain = qa_chain

        await cl.Message(
            content=(
                f"Selected paper: {self.selected_paper.title}\n"
                f"Link: {self.selected_paper.entry_id}\n\n"
                f"You can now ask questions about this paper. "
                f"Type 'new search' when you want to search for a different paper."
            )
        ).send()
        self.state = "QA"
        return self.selected_paper

    def _build_qa_chain(self, paper):
        """Split, embed and index ``paper``'s feed text; return a conversational QA chain."""
        # Compose the text from the feed fields
        paper_text = (
            f"{paper.title}\n\n"
            f"{paper.summary}\n\n"
            f"{paper.comment or ''}"
        )

        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        chunks = text_splitter.split_text(paper_text)
        total = len(chunks)

        vectorstore = FAISS.from_texts(
            chunks,
            OpenAIEmbeddings(),
            # One metadata dict per chunk so answers can cite their source chunk.
            metadatas=[
                {
                    "title": paper.title,
                    "link": paper.entry_id,
                    "chunk": f"Chunk {i}/{total}",
                }
                for i, _ in enumerate(chunks, start=1)
            ],
        )

        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            # The chain returns both "answer" and "source_documents"; tell the
            # memory which of the two to record.
            output_key="answer",
        )

        return ConversationalRetrievalChain.from_llm(
            ChatOpenAI(temperature=0, model="gpt-4o-mini"),
            vectorstore.as_retriever(),
            memory=memory,
            return_source_documents=True,
        )

    async def process_question(self, message: str):
        """Answer ``message`` about the selected paper, or restart on 'new search'.

        Returns the answer text, followed by a "Sources:" list when the chain
        reports source documents. Returns ``None`` after handling the
        'new search' command (matched case-insensitively, ignoring
        surrounding whitespace), in which case the assistant is reset.
        """
        if message.strip().lower() == "new search":
            self.reset()
            await cl.Message(content="Sure! Please enter a new search query for arXiv papers.").send()
            return None

        # The chain call is synchronous (LLM + retrieval round trips); keep it
        # off the event loop.
        response = await asyncio.to_thread(self.qa_chain, {"question": message})
        answer = response["answer"]

        sources = "\n".join(
            f"- {doc.metadata.get('title','Unknown title')} "
            f"({doc.metadata.get('link','No link')}) - {doc.metadata.get('chunk','No chunk info')}"
            for doc in response.get("source_documents", [])
        )
        if sources:
            answer += f"\n\nSources:\n{sources}"

        return answer

    def reset(self):
        """Forget the current paper, chain and results and return to the search step."""
        self.selected_paper = None
        self.qa_chain = None
        self.papers = []
        self.state = "SEARCH"

# Global assistant instance, created once when the module is imported.
# NOTE(review): any handler that reads this module-level object shares one
# state machine (state, papers, chat memory) across every chat session served
# by this process — per-user state belongs in per-session storage such as
# cl.user_session.
assistant = ArxivResearchAssistant()

@cl.on_chat_start
async def start():
    """Send the welcome text that asks the user for a search topic."""
    welcome_text = (
        "Welcome! This tool helps you search for papers on arXiv, pick one, and ask questions about its content.\n\n"
        "Please enter a topic to search for on arXiv papers."
    )
    greeting = cl.Message(content=welcome_text)
    await greeting.send()

@cl.on_message
async def main(message: cl.Message):
    """Route an incoming chat message to the handler for the session's current step.

    Each chat session gets its own ``ArxivResearchAssistant``, stored in
    ``cl.user_session`` and created lazily on the first message. In the
    ``"QA"`` step the returned answer (if any) is sent back to the user; the
    other steps send their own messages.
    """
    # A single module-level assistant would be shared by every connected user,
    # letting one user's search, selection and chat history leak into
    # another's conversation. Keep the workflow state per session instead.
    session_assistant = cl.user_session.get("assistant")
    if session_assistant is None:
        session_assistant = ArxivResearchAssistant()
        cl.user_session.set("assistant", session_assistant)

    if session_assistant.state == "SEARCH":
        await session_assistant.search_papers(message.content)
    elif session_assistant.state == "SELECT":
        await session_assistant.select_paper(message.content)
    elif session_assistant.state == "QA":
        answer = await session_assistant.process_question(message.content)
        if answer:
            await cl.Message(content=answer).send()

if __name__ == "__main__":
    # Chainlit apps are normally launched with `chainlit run <file>`. The
    # `chainlit` package has no top-level `run` callable, so to support
    # `python <file>` start the server through the CLI helper instead.
    from chainlit.cli import run_chainlit

    run_chainlit(__file__)