RCaz commited on
Commit
bcfa0c8
·
1 Parent(s): 5821a0e

modification for production

Browse files
DESCRIPTION.md CHANGED
@@ -234,4 +234,17 @@ Medical question answering
234
 
235
  Literature reviews
236
 
237
- Automated extraction pipelines
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
  Literature reviews
236
 
237
+ Automated extraction pipelines
238
+
239
+
240
+ ## Git branches
241
+
242
+ main : main branch to merge development
243
+ dev : auxiliary branches to add components
244
+ production : branch to push on huggingface space [specific remote branch]
245
+
246
+ Changes for production include:
247
+ - Guard function to ensure the clinical trials topic
248
+ - PATCH OpenInference
249
+ - disable tqdm
250
+ - patch
agent.py CHANGED
@@ -101,42 +101,9 @@ def parse_pdf(pdf_path:str)->list[str]:
101
  text.append(page.extract_text())
102
  return text
103
 
104
- # @tool
105
- # def make_rag_ressource(paths :list(str)) -> list(str):
106
- # """
107
- # Use extracted text to build a RAG tool and retreive documents to use to answer request
108
-
109
- # Args:
110
- # paths: The list of path where the file are stored
111
-
112
- # Returns:
113
- # A list of strings, where each string is the extracted text content
114
- # from the retreiver
115
- # """
116
-
117
- # pdf_files=[]
118
- # for path in paths:
119
-
120
-
121
- # pdf_documents = []
122
- # for pdf_file in pdf_files:
123
- # loader = PyPDFLoader(pdf_file)
124
- # pdf_documents.extend(loader.load())
125
- # embeddings_model = OpenAIEmbeddings()
126
- # pdf_texts = [doc.page_content for doc in pdf_documents]
127
- # return ""
128
-
129
-
130
- # # Initialize the model
131
- # model = InferenceClientModel(
132
- # model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
133
- # provider="nebius"
134
- # )
135
-
136
 
137
 
138
  # Create clinical trial search agent
139
-
140
  clinical_agent = CodeAgent(
141
  name="clinical_agent",
142
  description=(
 
101
  text.append(page.extract_text())
102
  return text
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
 
106
  # Create clinical trial search agent
 
107
  clinical_agent = CodeAgent(
108
  name="clinical_agent",
109
  description=(
tool_create_FAISS_vector.py CHANGED
@@ -1,184 +1,50 @@
 
1
  from pypdf import PdfReader
2
- import requests
3
  from io import BytesIO
4
- import serpapi
 
 
 
 
5
  import os
6
  from dotenv import load_dotenv
7
  load_dotenv()
8
 
9
- from langchain_core.documents import Document as LangchainDocument
 
 
 
10
  from metapub import FindIt
11
- import requests
12
  import xml.etree.ElementTree as ET
13
 
 
14
  from ftplib import FTP
15
  from urllib.parse import urlparse
16
- from io import BytesIO
17
-
18
- from langchain_community.retrievers import ArxivRetriever
19
 
 
20
  import arxiv
21
- import requests
22
- from io import BytesIO
23
- from pypdf import PdfReader
24
- import re
25
-
26
- from langchain_community.vectorstores.utils import DistanceStrategy
27
- from langchain_community.embeddings import HuggingFaceEmbeddings
28
- from transformers import AutoTokenizer
29
- from langchain_text_splitters import RecursiveCharacterTextSplitter
30
- from tqdm import tqdm
31
 
 
32
  import re
33
- from typing import List, Dict, Tuple
34
-
35
-
36
- def parse_pdf_file(path:str) -> str:
37
-
38
- if path.startswith("http://") or path.startswith("https://") or path.startswith("ftp://"):
39
- response = requests.get(path)
40
- response.raise_for_status() # Ensure download succeeded
41
- reader = PdfReader(BytesIO(response.content))
42
- else:
43
- reader = PdfReader(path)
44
-
45
- text = ""
46
- for page in reader.pages:
47
- text += page.extract_text() or ""
48
-
49
- return text
50
-
51
- def get_paper_from_arxiv_id(doi: str):
52
- """
53
- Retrieve paper from arXiv using its arXiv ID.
54
- """
55
- client = arxiv.Client()
56
- search = arxiv.Search(query=doi, max_results=1)
57
- results = client.results(search)
58
- pdf_url = next(results).pdf_url
59
- text = parse_pdf_file(pdf_url)
60
- return text
61
-
62
- def get_paper_from_arxiv_id_langchain(arxiv_id: str):
63
- """
64
- Retrieve paper from arXiv using its arXiv ID. ==> returns a Langchain Document
65
- """
66
- search = "2304.07814"
67
- retriever = ArxivRetriever(
68
- load_max_docs=2,
69
- get_full_documents=True,
70
- )
71
- docs = retriever.invoke(search)
72
- return docs
73
-
74
- def get_paper_from_pmid(pmid:str):
75
- src = FindIt(pmid)
76
- if src.url:
77
- pdf_text = parse_pdf_file(src.url)
78
- return pdf_text
79
- else:
80
- print(src.reason)
81
-
82
-
83
-
84
- def download_pdf_via_ftp(url: str) -> bytes:
85
- """
86
- Download a PDF file from an FTP URL and return its content as bytes.
87
- """
88
- parsed_url = urlparse(url)
89
- ftp_host = parsed_url.netloc
90
- ftp_path = parsed_url.path
91
-
92
- file_buffer = BytesIO()
93
-
94
- with FTP(ftp_host) as ftp:
95
- ftp.login()
96
- ftp.retrbinary(f'RETR {ftp_path}', file_buffer.write)
97
-
98
- file_buffer.getvalue()
99
- file_buffer.seek(0)
100
- return file_buffer
101
-
102
-
103
- def parse_pdf_from_pubmed_pmid(pmid: str) -> str:
104
- """
105
- Download and parse a PDF from PubMed using its PMID.
106
- """
107
- url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}"
108
- response = requests.get(url)
109
- cleaned_string = response.content.decode('utf-8').strip()
110
- try:
111
- root = ET.fromstring(cleaned_string)
112
- pdf_link_element = root.find(".//link[@format='pdf']")
113
- ftp_url = pdf_link_element.get('href')
114
- file_byte = download_pdf_via_ftp(ftp_url)
115
 
116
- reader = PdfReader(file_byte)
117
- text = ""
118
- for page in reader.pages:
119
- text += page.extract_text() or ""
120
- print(f"got {pmid} via ftp download")
121
- return text
122
- except Exception as e:
123
- print(e)
124
-
125
- def download_pdf_from_url(url):
126
- """
127
- Download and extract text from a PDF URL
128
- """
129
- headers = {
130
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
131
- }
132
- response = requests.get(url, headers=headers, timeout=30)
133
- response.raise_for_status()
134
- content_type = response.headers.get('content-type', '').lower()
135
- if 'pdf' not in content_type and not response.content.startswith(b'%PDF'):
136
- raise Exception(f"URL did not return a PDF (got {content_type})")
137
-
138
- reader = PdfReader(BytesIO(response.content))
139
- text = ""
140
- for page in reader.pages:
141
- text += page.extract_text() #or ""
142
- return text
143
-
144
-
145
- def download_paper_from_doi(doi):
146
- """
147
- Attempt to download paper from DOI with multiple fallback methods
148
- """
149
- # Clean DOI if it has prefix
150
- doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')
151
-
152
- # Method 1: Try Unpaywall API (free, legal access)
153
- try:
154
- unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=your@email.com"
155
- response = requests.get(unpaywall_url, timeout=10)
156
- if response.status_code == 200:
157
- data = response.json()
158
- if data.get('best_oa_location') and data['best_oa_location'].get('url_for_pdf'):
159
- pdf_url = data['best_oa_location']['url_for_pdf']
160
- text = download_pdf_from_url(pdf_url)
161
- print(f"Found PDF via Unpaywall: {pdf_url}")
162
- return text
163
- except Exception as e:
164
- print(f"Unpaywall failed: {e}")
165
-
166
- def get_pdf_content_serpapi(doi: str) -> str:
167
- """
168
- Get the link to the paper from its DOI using SerpAPI Google Scholar search.
169
- """
170
- client = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY"))
171
- results = client.search({
172
- 'engine': 'google_scholar',
173
- 'q': doi,
174
- })
175
 
176
- pdf_path = results["organic_results"][0]["link"]
177
- pdf_text = parse_pdf_file(pdf_path)
178
- return pdf_text
179
 
 
 
180
 
 
 
 
 
 
181
 
 
 
182
 
183
  class ReferenceExtractor:
184
  """Extract and classify references from LLM outputs."""
@@ -339,7 +205,7 @@ def create_vector_store_from_list_of_doi(refs :str, VECTOR_DB_PATH:str) -> str:
339
  # define embedding
340
  device = get_device()
341
 
342
- embedding_name="BAAI/bge-large-en-v1.5"
343
  embedding_model = HuggingFaceEmbeddings(model_name=embedding_name,
344
  model_kwargs={"device": device}, # set device acording to availaility
345
  encode_kwargs={"normalize_embeddings": True},)
 
1
+ # PDF parsing
2
  from pypdf import PdfReader
 
3
  from io import BytesIO
4
+
5
+ # HTTP requests
6
+ import requests
7
+
8
+ # Environment
9
  import os
10
  from dotenv import load_dotenv
11
  load_dotenv()
12
 
13
+ # SerpAPI DOI lookup
14
+ import serpapi
15
+
16
+ # PubMed / Metapub
17
  from metapub import FindIt
 
18
  import xml.etree.ElementTree as ET
19
 
20
+ # FTP download
21
  from ftplib import FTP
22
  from urllib.parse import urlparse
 
 
 
23
 
24
+ # ArXiv
25
  import arxiv
26
+ from langchain_community.retrievers import ArxivRetriever
 
 
 
 
 
 
 
 
 
27
 
28
+ # Regex
29
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # LangChain document
32
+ from langchain_core.documents import Document as LangchainDocument
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ # Reference parser & vector store tools
35
+ from tool_create_FAISS_vector import *
 
36
 
37
+ # Torch device detection
38
+ import torch
39
 
40
+ # Embeddings & vector store dependencies
41
+ from langchain_community.vectorstores.utils import DistanceStrategy
42
+ from langchain_community.embeddings import HuggingFaceEmbeddings
43
+ from transformers import AutoTokenizer
44
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
45
 
46
+ # Progress bar
47
+ from tqdm import tqdm
48
 
49
  class ReferenceExtractor:
50
  """Extract and classify references from LLM outputs."""
 
205
  # define embedding
206
  device = get_device()
207
 
208
+ embedding_name="BAAI/bge-small-en-v1.5"
209
  embedding_model = HuggingFaceEmbeddings(model_name=embedding_name,
210
  model_kwargs={"device": device}, # set device acording to availaility
211
  encode_kwargs={"normalize_embeddings": True},)
tool_fetch_documents_DOI.py DELETED
File without changes
tool_fetch_documents_texts.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF parsing
2
+ from pypdf import PdfReader
3
+ from io import BytesIO
4
+
5
+ # HTTP requests
6
+ import requests
7
+
8
+ # XML parsing (PubMed FTP metadata)
9
+ import xml.etree.ElementTree as ET
10
+
11
+ # FTP download
12
+ from ftplib import FTP
13
+ from urllib.parse import urlparse
14
+
15
+ # ArXiv retrieval
16
+ import arxiv
17
+ from langchain_community.retrievers import ArxivRetriever
18
+
19
+ # PubMed → PDF resolution
20
+ from metapub import FindIt
21
+
22
+ # SerpAPI DOI search
23
+ import serpapi
24
+ import os
25
+ from dotenv import load_dotenv
26
+
27
+ load_dotenv()
28
+
29
def parse_pdf_file(path: str) -> str:
    """Extract the full text of a PDF located at *path*.

    Args:
        path: A local filesystem path, or an http(s)/ftp URL to a PDF.

    Returns:
        The concatenated text of every page; pages with no extractable
        text contribute an empty string.
    """
    # NOTE(review): requests does not support the ftp:// scheme, so the
    # ftp branch below would raise InvalidSchema at runtime — confirm
    # callers only pass http(s) URLs or local paths.
    if path.startswith(("http://", "https://", "ftp://")):
        response = requests.get(path)
        response.raise_for_status()  # Ensure download succeeded
        reader = PdfReader(BytesIO(response.content))
    else:
        reader = PdfReader(path)

    # extract_text() may return None for image-only pages; coalesce to "".
    return "".join(page.extract_text() or "" for page in reader.pages)
def get_paper_from_arxiv_id(doi: str):
    """
    Retrieve the full text of a paper from arXiv.

    Args:
        doi: Query string for arXiv (typically an arXiv ID or DOI).

    Returns:
        The extracted PDF text.

    Raises:
        ValueError: If the query returns no results (the original code
            raised a bare StopIteration here).
    """
    client = arxiv.Client()
    search = arxiv.Search(query=doi, max_results=1)
    results = client.results(search)
    first = next(results, None)
    if first is None:
        raise ValueError(f"No arXiv result found for query: {doi!r}")
    return parse_pdf_file(first.pdf_url)
def get_paper_from_arxiv_id_langchain(arxiv_id: str):
    """
    Retrieve a paper from arXiv by ID and return LangChain Documents.

    Args:
        arxiv_id: The arXiv identifier to look up.

    Returns:
        A list of LangChain Document objects with full paper content.
    """
    # BUG FIX: the original ignored `arxiv_id` and searched a
    # hard-coded ID ("2304.07814").
    retriever = ArxivRetriever(
        load_max_docs=2,
        get_full_documents=True,
    )
    return retriever.invoke(arxiv_id)
def get_paper_from_pmid(pmid: str):
    """
    Retrieve the full text of a PubMed paper via metapub's FindIt.

    Args:
        pmid: PubMed identifier.

    Returns:
        The extracted PDF text, or None when no free full-text URL is
        available (the reason reported by FindIt is printed).
    """
    src = FindIt(pmid)
    if src.url:
        return parse_pdf_file(src.url)
    # No open-access URL; report why and return None explicitly
    # (the original fell off the end and returned None implicitly).
    print(src.reason)
    return None
def download_pdf_via_ftp(url: str) -> BytesIO:
    """
    Download a file from an FTP URL into memory.

    Args:
        url: An ftp:// URL pointing at the file.

    Returns:
        A BytesIO positioned at the start of the downloaded content.
        (The original annotation said ``bytes``, but a BytesIO was
        returned — callers such as PdfReader accept a file-like object.)
    """
    parsed_url = urlparse(url)
    ftp_host = parsed_url.netloc
    ftp_path = parsed_url.path

    file_buffer = BytesIO()

    # Anonymous login; retrbinary streams the file into the buffer.
    with FTP(ftp_host) as ftp:
        ftp.login()
        ftp.retrbinary(f'RETR {ftp_path}', file_buffer.write)

    # Removed a discarded `file_buffer.getvalue()` call (dead code).
    file_buffer.seek(0)
    return file_buffer
def parse_pdf_from_pubmed_pmid(pmid: str) -> str:
    """
    Download and parse a PDF from the PMC Open Access service by PMID.

    Args:
        pmid: PubMed identifier.

    Returns:
        Extracted text, or None on any failure. Failures are printed
        rather than raised (best-effort, so batch callers keep going).
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}"
    response = requests.get(url)
    cleaned_string = response.content.decode('utf-8').strip()
    try:
        root = ET.fromstring(cleaned_string)
        pdf_link_element = root.find(".//link[@format='pdf']")
        if pdf_link_element is None:
            # Clearer diagnostic than the AttributeError the original
            # raised (and swallowed) when no PDF link was present.
            raise ValueError(f"No PDF link in OA response for PMID {pmid}")
        ftp_url = pdf_link_element.get('href')
        file_byte = download_pdf_via_ftp(ftp_url)

        reader = PdfReader(file_byte)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
        print(f"got {pmid} via ftp download")
        return text
    except Exception as e:
        # Best-effort by design: swallow and report.
        print(e)
        return None
def download_pdf_from_url(url):
    """
    Download a PDF from *url* and extract its text.

    Args:
        url: Direct link to a PDF resource.

    Returns:
        Concatenated text of all pages.

    Raises:
        Exception: If the response is not a PDF.
        requests.HTTPError: On a non-2xx response.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    content_type = response.headers.get('content-type', '').lower()
    if 'pdf' not in content_type and not response.content.startswith(b'%PDF'):
        raise Exception(f"URL did not return a PDF (got {content_type})")

    reader = PdfReader(BytesIO(response.content))
    text = ""
    for page in reader.pages:
        # BUG FIX: extract_text() may return None for image-only pages;
        # the `or ""` guard had been commented out, which made
        # `text +=` raise TypeError.
        text += page.extract_text() or ""
    return text
def download_paper_from_doi(doi):
    """
    Attempt to download a paper's text from its DOI via Unpaywall.

    (The original docstring promised "multiple fallback methods", but
    only the Unpaywall lookup is implemented.)

    Args:
        doi: DOI string, with or without the https://doi.org/ prefix.

    Returns:
        Extracted text if an open-access PDF was found, else None.
    """
    # Clean DOI if it has a prefix
    doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')

    # Unpaywall API (free, legal access). TODO(review): the email below
    # is a placeholder — Unpaywall asks for a real contact address.
    try:
        unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=your@email.com"
        response = requests.get(unpaywall_url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            best = data.get('best_oa_location')
            if best and best.get('url_for_pdf'):
                pdf_url = best['url_for_pdf']
                text = download_pdf_from_url(pdf_url)
                print(f"Found PDF via Unpaywall: {pdf_url}")
                return text
    except Exception as e:
        print(f"Unpaywall failed: {e}")
    # Explicit None on failure (the original returned None implicitly).
    return None
def get_pdf_content_serpapi(doi: str) -> str:
    """
    Resolve a DOI to a paper link via SerpAPI Google Scholar and parse it.

    Args:
        doi: DOI (or any Scholar query) identifying the paper.

    Returns:
        The extracted text of the first organic result's linked document.

    Raises:
        ValueError: If the search returns no organic results (the
            original raised an opaque IndexError here).
    """
    client = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY"))
    results = client.search({
        'engine': 'google_scholar',
        'q': doi,
    })

    organic = results["organic_results"]
    if not organic:
        raise ValueError(f"No Google Scholar results for DOI {doi!r}")
    pdf_path = organic[0]["link"]
    return parse_pdf_file(pdf_path)