linh-hk committed on
Commit
a96bcc0
·
0 Parent(s):

First version

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +25 -0
  3. app.py +214 -0
  4. core/DefaultPackages/__init__.py +4 -0
  5. core/DefaultPackages/__pycache__/__init__.cpython-310.pyc +0 -0
  6. core/DefaultPackages/__pycache__/__init__.cpython-311.pyc +0 -0
  7. core/DefaultPackages/__pycache__/openFile.cpython-310.pyc +0 -0
  8. core/DefaultPackages/__pycache__/openFile.cpython-311.pyc +0 -0
  9. core/DefaultPackages/__pycache__/saveFile.cpython-310.pyc +0 -0
  10. core/DefaultPackages/__pycache__/saveFile.cpython-311.pyc +0 -0
  11. core/DefaultPackages/openFile.py +12 -0
  12. core/DefaultPackages/saveFile.py +11 -0
  13. core/NER/PDF/__pycache__/pdf.cpython-310.pyc +0 -0
  14. core/NER/PDF/__pycache__/pdf.cpython-311.pyc +0 -0
  15. core/NER/PDF/pdf.py +193 -0
  16. core/NER/WordDoc/__pycache__/wordDoc.cpython-310.pyc +0 -0
  17. core/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc +0 -0
  18. core/NER/WordDoc/wordDoc.py +178 -0
  19. core/NER/__pycache__/cleanText.cpython-310.pyc +0 -0
  20. core/NER/__pycache__/cleanText.cpython-311.pyc +0 -0
  21. core/NER/cleanText.py +115 -0
  22. core/NER/html/__pycache__/extractHTML.cpython-310.pyc +0 -0
  23. core/NER/html/__pycache__/extractHTML.cpython-311.pyc +0 -0
  24. core/NER/html/extractHTML.py +226 -0
  25. core/NER/word2Vec/__pycache__/word2vec.cpython-310.pyc +0 -0
  26. core/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc +0 -0
  27. core/NER/word2Vec/heuristic.py +52 -0
  28. core/NER/word2Vec/testModel/test_model.model +0 -0
  29. core/NER/word2Vec/testModel/test_model.txt +25 -0
  30. core/NER/word2Vec/testModel/test_model_updated.model +0 -0
  31. core/NER/word2Vec/word2vec.py +436 -0
  32. core/__pycache__/data_preprocess.cpython-310.pyc +0 -0
  33. core/__pycache__/drive_utils.cpython-310.pyc +0 -0
  34. core/__pycache__/model.cpython-310.pyc +0 -0
  35. core/__pycache__/mtdna_backend.cpython-310.pyc +0 -0
  36. core/__pycache__/mtdna_classifier.cpython-310.pyc +0 -0
  37. core/__pycache__/pipeline.cpython-310.pyc +0 -0
  38. core/__pycache__/smart_fallback.cpython-310.pyc +0 -0
  39. core/__pycache__/standardize_location.cpython-310.pyc +0 -0
  40. core/__pycache__/upgradeClassify.cpython-310.pyc +0 -0
  41. core/data_preprocess.py +744 -0
  42. core/drive_utils.py +138 -0
  43. core/model.py +1414 -0
  44. core/mtdna_backend.py +426 -0
  45. core/mtdna_classifier.py +764 -0
  46. core/pipeline.py +793 -0
  47. core/smart_fallback.py +259 -0
  48. core/standardize_location.py +90 -0
  49. core/upgradeClassify.py +276 -0
  50. env.yaml +8 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ static/images/flowchart.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Official Python 3.10 base image (full variant, not -slim)
FROM python:3.10

# Create the unprivileged runtime user and give it ownership of the app
# directory *before* dropping privileges, so the app can write there.
RUN useradd -m -u 1000 user \
    && mkdir -p /app \
    && chown user:user /app
USER user

# Fast, clean installs. pip run as a non-root user installs console scripts
# into ~/.local/bin, so that directory must be on PATH.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PORT=7860

WORKDIR /app

# Install Python deps (copied first so this layer is cached across code changes)
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the application code
COPY --chown=user . /app

# Expose the port Hugging Face expects
EXPOSE 7860

# Run the app (app.py reads $PORT set above)
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, uuid, time, os
2
+ import flask
3
+ from flask_socketio import SocketIO, emit, join_room, leave_room
4
+ import eventlet
5
+ from core.mtdna_backend import *
6
+
# accessions = []
# Flag forwarded to every template. It is currently hard-coded for all
# visitors; TODO: derive it from the actual user.
isvip = True

app = flask.Flask(__name__)
# Debug mode and the session secret keep their development defaults but can be
# overridden from the environment, so deployments need not ship "dev-key".
app.config["DEBUG"] = os.environ.get("FLASK_DEBUG", "1").lower() in ("1", "true", "yes")
app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "dev-key")
socketio = SocketIO(app, async_mode="eventlet", cors_allowed_origins="*")

# --- job registry for cancel flags ---
# Use a simple boolean flag in eventlet mode: True => cancel requested
CANCEL_FLAGS = {}  # {job_id: bool}
# {job_id: {"accs": [...], "user": {...usage bookkeeping...}, "started": False}}
JOBS = {}
19
+
# Home
@app.route("/")
def home():
    """Serve the landing page."""
    template_name = "Home.html"
    return flask.render_template(template_name, isvip=isvip)
24
+
# Submit route
@app.route("/submit", methods=["POST"])
def submit():
    """Register a job from the submitted form and redirect to its output page.

    Reads ``raw_text``, ``file_upload`` and ``user_email`` from the request,
    extracts the accession list, trims it to the user's remaining allowance
    and stores everything in ``JOBS`` under a fresh job id. The job itself is
    started later, when the browser joins the Socket.IO room.
    """
    raw_text = flask.request.form.get("raw_text", "").strip()
    file_upload = flask.request.files.get("file_upload")
    user_email = flask.request.form.get("user_email", "").strip()

    if file_upload and getattr(file_upload, "filename", ""):
        # Copy the upload into a named in-memory buffer before handing it on
        data = file_upload.read()
        buf = io.BytesIO(data)
        buf.name = file_upload.filename
        file_upload = buf

    accs, error = extract_accessions_from_input(file=file_upload, raw_text=raw_text)
    if error:
        # Previously dropped silently; keep the request going but leave a trace.
        app.logger.warning("Accession extraction reported: %s", error)
    # NOTE(review): the extractor's failure value is not visible here; guard
    # against None so len()/slicing below cannot crash.
    accs = accs or []

    job_id = uuid.uuid4().hex[:8]
    CANCEL_FLAGS[job_id] = False

    # Obtain user's past usage (an increment of 0 only reads the counters)
    user_hash = hash_user_id(user_email)
    user_usage, max_allowed = increment_usage(user_hash, 0)
    # Allowance still available before this job runs
    remaining_trials = max(max_allowed - user_usage, 0)
    # Cap the number of queries so the job never exceeds the allowance
    total_queries = max(0, min(len(accs), max_allowed - user_usage))

    # The list of IDs that will be run within allowance
    accs = accs[:total_queries]

    # Save the job description in the module-level registry
    JOBS[job_id] = {
        "accs": accs,
        "user": {
            "user_hash": user_hash,
            "user_usage": user_usage,
            "max_allowed": max_allowed,
            "total_queries": total_queries,
            "remaining_trials": remaining_trials,
        },
        "started": False,
    }

    return flask.redirect(flask.url_for("output", job_id=job_id))
63
+
# Output page (must accept job_id!)
@app.route("/output/<job_id>")
def output(job_id):
    """Render the live results page for ``job_id``.

    Responds with 404 when the job is unknown. This includes jobs that have
    already finished, because ``run_job`` removes its entry from ``JOBS`` when
    it ends; previously that case raised ``KeyError`` and produced a 500.
    """
    job = JOBS.get(job_id)
    if job is None:
        flask.abort(404)

    started_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    total_queries = job['user']['total_queries']
    return flask.render_template(
        "Output.html",
        job_id=job_id,
        started_at=started_at,
        total_queries=total_queries,
        isvip=isvip,
        ws_url="",  # leave empty for mock/demo mode; set later to real WS URL
    )
77
+
# Background worker that streams job progress to the browser via Socket.IO

# Column order used when the backend returns a row as a plain list
_RESULT_KEYS = (
    "Sample ID", "Predicted Country", "Country Explanation",
    "Predicted Sample Type", "Sample Type Explanation",
    "Sources", "Time cost",
)


def _normalise_result(out):
    """Coerce a ``summarize_results`` return value into a dict (possibly empty)."""
    if not out:
        return {}
    if isinstance(out, dict):
        return out
    if isinstance(out, list):
        first = out[0]
        if isinstance(first, dict):
            return first
        if isinstance(first, list):
            # very defensive: list-of-lists -> map by expected order
            return {
                key: (first[idx] if idx < len(first) else "")
                for idx, key in enumerate(_RESULT_KEYS)
            }
    return {}


def run_job(job_id, accessions):
    """Process ``accessions`` one by one and emit progress to room ``job_id``.

    Emits ``status`` (started / cancelled / finished / error), ``row`` (one
    per accession) and ``log`` events. The user's usage counter is increased
    by the number of accessions processed, both on normal completion and on
    cancellation. The job's entries in ``CANCEL_FLAGS`` and ``JOBS`` are
    always removed at the end.
    """
    user = JOBS[job_id]['user']
    total_queries = user['total_queries']
    user_hash = user['user_hash']  # to update allowance in case cancelled
    remaining_trials = user['remaining_trials']
    room = job_id

    def send_log(msg):
        socketio.emit("log", {"msg": msg}, room=room)

    try:
        socketio.emit("status", {"state": "started", "total": len(accessions)}, room=room)
        start_time = time.perf_counter()
        send_log(f"Job {job_id} started. {total_queries} accession(s).")

        for i, acc in enumerate(accessions, 1):
            if CANCEL_FLAGS.get(job_id):
                send_log("Cancellation requested. Stopping…")
                socketio.emit("status", {"state": "cancelled"}, room=room)
                # Charge only for the accessions finished before the cancel
                increment_usage(user_hash, i - 1)
                return

            t0 = time.perf_counter()
            out = summarize_results(acc)  # may be dict / [] / None
            dt = time.perf_counter() - t0

            out = _normalise_result(out)

            # ---- map backend keys (Title Case) to frontend keys (snake_case) ----
            row = {
                "idx": i,
                "sample_id": out.get("Sample ID") or str(acc),
                "predicted_country": out.get("Predicted Country", "unknown"),
                "country_explanation": out.get("Country Explanation", "unknown"),
                "predicted_sample_type": out.get("Predicted Sample Type", "unknown"),
                "sample_type_explanation": out.get("Sample Type Explanation", "unknown"),
                "sources": out.get("Sources", "No Links"),
                "time_cost": out.get("Time cost") or f"{dt:.2f}s",
            }

            socketio.emit("row", row, room=room)
            socketio.sleep(0)  # yield to the event loop so the emit flushes
            send_log(f"Processed {acc} in {dt:.2f}s")

        total_dt = time.perf_counter() - start_time

        # Update user allowance
        increment_usage(user_hash, total_queries)
        # remaining_trials was recorded before the job ran; subtract what was used
        trials_left = max(remaining_trials - total_queries, 0)

        socketio.emit("status", {"state": "finished", "elapsed": f"{total_dt:.2f}s"}, room=room)
        send_log(f"Job finished successfully. Number of trials left is {trials_left}")
    except Exception as e:
        # Top-level boundary of the background task: report to the client
        send_log(f"ERROR: {e}")
        socketio.emit("status", {"state": "error", "message": str(e)}, room=room)
    finally:
        CANCEL_FLAGS.pop(job_id, None)
        JOBS.pop(job_id, None)  # tidy queued job
166
+
167
+ # ---- Socket.IO events ----
@socketio.on("connect")
def on_connect():
    """Acknowledge a new Socket.IO connection to the connecting client."""
    payload = {"ok": True}
    emit("connected", payload)
171
+
@socketio.on("join")
def on_join(data):
    """Put the client in the job's room and start the job on first join."""
    job_id = data.get("job_id")
    if not job_id:
        return

    join_room(job_id)
    emit("joined", {"room": job_id})

    # Start the job once the client is in the room (only the first time)
    job = JOBS.get(job_id)
    if not job or job["started"]:
        return

    job["started"] = True
    accs = job["accs"]
    # Send an initial queued/total status so the UI can set progress denominator
    socketio.emit("status", {"state": "queued", "total": len(accs)}, room=job_id)
    socketio.start_background_task(run_job, job_id, accs)
187
+
@socketio.on("leave")
def on_leave(data):
    """Remove the client from the job's room when a job id is supplied."""
    job_id = data.get("job_id")
    if not job_id:
        return
    leave_room(job_id)
193
+
@socketio.on("cancel")
def on_cancel(data):
    """Flag a known job for cancellation and tell its room it is cancelling."""
    job_id = data.get("job_id")
    if job_id not in CANCEL_FLAGS:
        return
    CANCEL_FLAGS[job_id] = True  # run_job polls this flag between accessions
    emit("status", {"state": "cancelling"}, room=job_id)
200
+
@app.route("/about")
def about():
    """Serve the About page."""
    template_name = "About.html"
    return flask.render_template(template_name, isvip=isvip)
204
+
@app.route("/pricing")
def pricing():
    """Serve the Pricing page."""
    template_name = "Pricing.html"
    return flask.render_template(template_name, isvip=isvip)
208
+
@app.route("/contact")
def contact():
    """Serve the Contact page."""
    template_name = "Contact.html"
    return flask.render_template(template_name, isvip=isvip)
212
+
# Start the server only when executed as a script (``python app.py``), so that
# importing this module, e.g. from tests, does not block on socketio.run.
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))  # HF Spaces injects PORT
    socketio.run(app, host="0.0.0.0", port=port)
core/DefaultPackages/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
# Submodules exposed by ``from core.DefaultPackages import *``
__all__ = ['openFile', 'saveFile']
core/DefaultPackages/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (208 Bytes). View file
 
core/DefaultPackages/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (230 Bytes). View file
 
core/DefaultPackages/__pycache__/openFile.cpython-310.pyc ADDED
Binary file (581 Bytes). View file
 
core/DefaultPackages/__pycache__/openFile.cpython-311.pyc ADDED
Binary file (1.01 kB). View file
 
core/DefaultPackages/__pycache__/saveFile.cpython-310.pyc ADDED
Binary file (605 Bytes). View file
 
core/DefaultPackages/__pycache__/saveFile.cpython-311.pyc ADDED
Binary file (1.03 kB). View file
 
core/DefaultPackages/openFile.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
def openFile(file, encoding=None):
    """Return the whole text content of *file*.

    Args:
        file: Path of the file to read.
        encoding: Text encoding passed to ``open``; ``None`` (the default)
            keeps the platform default, as before.
    """
    with open(file, encoding=encoding) as handle:
        return handle.read()
5
+
def openJsonFile(file):
    """Parse the JSON document stored at *file* and return the resulting object."""
    import json

    with open(file, 'r') as source:
        return json.load(source)
core/DefaultPackages/saveFile.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
def saveFile(name, content):
    """Overwrite the file *name* with ``str(content)`` followed by a newline.

    The file is created if it does not exist. The previous implementation
    also opened the file in append mode and never closed that handle.
    """
    with open(name, "w") as external_file:
        # print() stringifies content and appends the trailing newline
        print(content, file=external_file)
def saveJsonFile(name, content):
    """Serialise *content* to JSON text and store it at *name* via saveFile."""
    serialised = json.dumps(content)
    saveFile(name, serialised)
core/NER/PDF/__pycache__/pdf.cpython-310.pyc ADDED
Binary file (6.12 kB). View file
 
core/NER/PDF/__pycache__/pdf.cpython-311.pyc ADDED
Binary file (11.3 kB). View file
 
core/NER/PDF/pdf.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!pip install pdfreader
2
+ import pdfreader
3
+ from pdfreader import PDFDocument, SimplePDFViewer
4
+ #!pip install bs4
5
+ from bs4 import BeautifulSoup
6
+ import requests
7
+ from core.NER import cleanText
8
+ #!pip install tabula-py
9
+ import tabula
10
+ import fitz # PyMuPDF
11
+ import os
12
+
class PDF():
    """Text and table extraction for one PDF given as a local path or https URL.

    Attributes:
        pdf: Local file path or https link of the document.
        saveFolder: Folder where a downloaded copy is stored.
        doi: Optional DOI string used to pick the right link when scraping.
    """

    def __init__(self, pdf, saveFolder, doi=None):
        self.pdf = pdf
        self.doi = doi
        self.saveFolder = saveFolder

    def _resolveLocalPath(self):
        """Return a local path for the PDF, downloading it first for https links."""
        if "https" in self.pdf:
            name = self.downloadPDF(self.saveFolder)
            if name != "no pdfLink to download":
                return os.path.join(self.saveFolder, name)
        return self.pdf

    def openPDFFile(self):
        """Open the (possibly freshly downloaded) PDF in binary mode.

        The caller owns the returned file object and must close it.
        """
        return open(self._resolveLocalPath(), "rb")

    def downloadPDF(self, saveFolder):
        """Download the PDF into *saveFolder* and return the saved file name.

        Returns the string "no pdfLink to download" when no link was found.
        """
        pdfLink = ''
        # NOTE(review): the scraping branch only runs when self.pdf contains
        # neither ".pdf" nor "https", yet the callers in this class only
        # download for https links -- confirm the intended condition.
        if ".pdf" not in self.pdf and "https" not in self.pdf:
            r = requests.get(self.pdf)
            soup = BeautifulSoup(r.content, 'html.parser')
            for link in soup.find_all("a"):
                href = link.get("href", "")
                # doi is optional; without it no link can be matched
                if ".pdf" in href and self.doi is not None and self.doi in href:
                    pdfLink = href
                    break
        else:
            pdfLink = self.pdf

        if pdfLink == '':
            return "no pdfLink to download"

        response = requests.get(pdfLink)
        name = pdfLink.split("/")[-1]
        print("inside download PDF and name and link are: ", pdfLink, name)
        print("saveFolder is: ", saveFolder)
        with open(os.path.join(saveFolder, name), 'wb') as pdf:
            print("len of response content: ", len(response.content))
            pdf.write(response.content)
        print("pdf downloaded")
        return name

    def extractText(self):
        """Return the document text via PyMuPDF, falling back to pdfreader.

        Returns "" when the file cannot be opened at all.
        """
        try:
            # Open and close immediately: only the resolved name is needed,
            # and the open() doubles as an existence check.
            with self.openPDFFile() as handle:
                fileToOpen = handle.name
            try:
                doc = fitz.open(fileToOpen)
                try:
                    text = ""
                    for page in doc:
                        text += page.get_text("text") + "\n\n"
                finally:
                    doc.close()

                if len(text.strip()) < 100:
                    print("Fallback to PDFReader due to weak text extraction.")
                    text = self.extractTextWithPDFReader()
                return text
            except Exception as e:
                print("Failed with PyMuPDF, fallback to PDFReader:", e)
                return self.extractTextWithPDFReader()
        except Exception:
            return ""

    def extract_text_excluding_tables(self):
        """Return text from paragraph-like blocks only, skipping table-like ones.

        A text block is kept when its lines average more than one span each.
        """
        with self.openPDFFile() as handle:
            fileToOpen = handle.name
        text = ""
        try:
            doc = fitz.open(fileToOpen)
            try:
                for page in doc:
                    for block in page.get_text("dict")["blocks"]:
                        if block["type"] != 0:  # 0 == text block
                            continue
                        lines = block.get("lines", [])
                        if not lines:
                            continue
                        # Despite the name, this counts spans per line
                        avg_words_per_line = sum(len(l["spans"]) for l in lines) / len(lines)
                        if avg_words_per_line > 1:  # Heuristic: paragraph-like blocks
                            for line in lines:
                                text += " ".join(span["text"] for span in line["spans"]) + "\n"
            finally:
                doc.close()
            if len(text.strip()) < 100:
                print("Fallback to PDFReader due to weak text extraction.")
                text = self.extractTextWithPDFReader()
            return text
        except Exception as e:
            print("Failed with PyMuPDF, fallback to PDFReader:", e)
            return self.extractTextWithPDFReader()

    def extractTextWithPDFReader(self):
        """Extract text page by page with pdfreader and merge it into one string.

        Any failure discards the partial result and yields "".
        """
        jsonPage = {}
        try:
            with self.openPDFFile() as pdf:
                print("open pdf file")
                print(pdf)
                doc = PDFDocument(pdf)
                viewer = SimplePDFViewer(pdf)
                pageCount = sum(1 for _ in doc.pages())
                cl = cleanText.cleanGenText()
                # Viewer pages are 1-based, so the last page is pageCount
                # (the previous range(1, pageCount) skipped it).
                for page in range(1, pageCount + 1):
                    viewer.navigate(page)
                    viewer.render()
                    key = str(page)
                    if key not in jsonPage:
                        jsonPage[key] = {}
                    text = "".join(viewer.canvas.strings)
                    clean, filteredWord = cl.textPreprocessing(text)
                    jsonPage[key]["normalText"] = [text]
                    jsonPage[key]["cleanText"] = [' '.join(filteredWord)]
                    jsonPage[key]["image"] = [viewer.canvas.images]
                    jsonPage[key]["form"] = [viewer.canvas.forms]
                    jsonPage[key]["content"] = [viewer.canvas.text_content]
                    jsonPage[key]["inline_image"] = [viewer.canvas.inline_images]
        except Exception:
            jsonPage = {}
        return self.mergeTextinJson(jsonPage)

    def extractTable(self, pages="all", saveFile=None, outputFormat=None):
        '''Return the tables tabula finds in the PDF, or [] on failure.

        pages (str, int, iterable of int, optional) –
        An optional values specifying pages to extract from. It allows str,`int`, iterable of :int. Default: "all"
        Examples: '1-2,3', 'all', [1,2]

        saveFile and outputFormat are accepted but currently unused.'''
        fileToOpen = self._resolveLocalPath()
        try:
            df = tabula.read_pdf(fileToOpen, pages=pages)
        except Exception:
            df = []
            print("No tables found in PDF file")
        return df

    def mergeTextinJson(self, jsonPDF):
        """Join the per-page "normalText" chunks into one string.

        Each non-empty chunk is whitespace-normalised and terminated with a
        period; pages are separated by a blank line. Returns "" on any error.
        """
        try:
            cl = cleanText.cleanGenText()
            pdfText = ""
            if jsonPDF:
                for page in jsonPDF:
                    chunks = jsonPDF[page]["normalText"]
                    if len(chunks) > 0:
                        for i in range(len(chunks)):
                            text = chunks[i]
                            if len(text) > 0:
                                text = cl.removeTabWhiteSpaceNewLine(text)
                                text = cl.removeExtraSpaceBetweenWords(text)
                                chunks[i] = text
                                # NOTE(review): "i - 1 > 0" skips the check for
                                # i == 1; kept as in the original -- confirm.
                                if i - 1 > 0:
                                    previous = chunks[i - 1]
                                    # Guard: an empty previous chunk used to
                                    # raise IndexError and discard all text
                                    if previous and previous[-1] != ".":
                                        pdfText += ". "
                                pdfText += chunks[i]
                                if len(chunks[i]) > 0 and chunks[i][-1] != ".":
                                    pdfText += "."
                        pdfText += "\n\n"
            return pdfText
        except Exception:
            return ""

    def getReference(self):
        """Placeholder: not implemented."""
        pass

    def getSupMaterial(self):
        """Placeholder: not implemented."""
        pass

    def removeHeaders(self):
        """Placeholder: not implemented."""
        pass

    def removeFooters(self):
        """Placeholder: not implemented."""
        pass

    def removeReference(self):
        """Placeholder: not implemented."""
        pass
core/NER/WordDoc/__pycache__/wordDoc.cpython-310.pyc ADDED
Binary file (4.59 kB). View file
 
core/NER/WordDoc/__pycache__/wordDoc.cpython-311.pyc ADDED
Binary file (11.1 kB). View file
 
core/NER/WordDoc/wordDoc.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! pip install spire.doc
2
+ #! pip install Spire.XLS
3
+ import pandas as pd
4
+ from spire.doc import *
5
+ from spire.doc.common import *
6
+ from spire.xls import *
7
+ from spire.xls.common import *
8
+ from core.NER import cleanText
9
+ import requests
class wordDoc():  # uses Spire.Doc / Spire.XLS (see the imports above)
    """Text and table extraction for one Word document (local path or URL).

    Attributes:
        wordDoc: Path or URL of the document.
        saveFolder: Folder used for downloaded copies and the Excel export.
    """

    # Fallback export folder used by extractTableAsExcel when saveFolder is None
    _DEFAULT_EXCEL_DIR = '/content/drive/MyDrive/CollectData/NER/excel/TestExamples/output/'

    def __init__(self, wordDoc, saveFolder):
        self.wordDoc = wordDoc
        self.saveFolder = saveFolder

    def openFile(self):
        """Load self.wordDoc into a new Document.

        NOTE(review): this returns whatever ``LoadFromFile`` returns rather
        than the Document itself -- confirm callers expect that.
        """
        document = Document()
        return document.LoadFromFile(self.wordDoc)

    def _loadDocument(self):
        """Return a loaded Document, downloading to saveFolder if direct load fails."""
        try:
            doc = Document()
            doc.LoadFromFile(self.wordDoc)
        except Exception:
            # Treat self.wordDoc as a URL: store a local copy, then load that
            response = requests.get(self.wordDoc)
            name = self.wordDoc.split("/")[-1]
            localPath = self.saveFolder + "/" + name
            with open(localPath, "wb") as temp_file:
                temp_file.write(response.content)
            doc = Document()
            doc.LoadFromFile(localPath)
        return doc

    @staticmethod
    def _cellText(cell):
        """Concatenate the cell's paragraph texts, each followed by one space."""
        return ''.join(
            cell.Paragraphs.get_Item(p).Text + ' '
            for p in range(cell.Paragraphs.Count)
        )

    def extractTextByPage(self):
        """Return the full text of the document (not split by page)."""
        # reference: https://medium.com/@alice.yang_10652/extract-text-from-word-documents-with-python-a-comprehensive-guide-95a67e23c35c
        doc = self._loadDocument()
        return doc.GetText()

    def extractTableAsText(self):
        """Return {"Section<s>": {"Table<i>": text}} with tab/newline separated cells."""
        # reference:
        # https://www.e-iceblue.com/Tutorials/Python/Spire.Doc-for-Python/Program-Guide/Table/Python-Extract-Tables-from-Word-Documents.html
        doc = self._loadDocument()
        tablesBySection = {}
        for s in range(doc.Sections.Count):
            section = doc.Sections.get_Item(s)
            sectionKey = "Section" + str(s)
            tablesBySection[sectionKey] = {}
            tables = section.Tables
            for i in range(tables.Count):
                table = tables.get_Item(i)
                rowTexts = []
                for j in range(table.Rows.Count):
                    cells = table.Rows.get_Item(j).Cells
                    cellTexts = [self._cellText(cells.get_Item(k)) for k in range(cells.Count)]
                    # Cells are tab-separated; every row ends with a newline
                    rowTexts.append('\t'.join(cellTexts) + '\n')
                tablesBySection[sectionKey]["Table" + str(i)] = ''.join(rowTexts)
        return tablesBySection

    def extractTableAsList(self):
        """Return every table as a list of rows, each row a list of stripped cell texts."""
        tables = []
        doc = self._loadDocument()
        for s in range(doc.Sections.Count):
            section = doc.Sections.get_Item(s)
            for i in range(section.Tables.Count):
                table = section.Tables.get_Item(i)
                table_data = []
                for row in range(table.Rows.Count):
                    cells = table.Rows.get_Item(row).Cells
                    row_data = []
                    for cell in range(cells.Count):
                        cell_obj = cells.get_Item(cell)
                        cell_text = ""
                        for p in range(cell_obj.Paragraphs.Count):
                            cell_text += cell_obj.Paragraphs.get_Item(p).Text.strip() + " "
                        row_data.append(cell_text.strip())
                    table_data.append(row_data)
                tables.append(table_data)
        return tables

    def extractTableAsExcel(self):
        """Write each table to its own worksheet and return the .xlsx path.

        Returns "No table found on word doc" when the export fails.
        """
        doc = self._loadDocument()
        try:
            wb = Workbook()
            wb.Worksheets.Clear()

            for i in range(doc.Sections.Count):
                section = doc.Sections.get_Item(i)
                for j in range(section.Tables.Count):
                    table = section.Tables.get_Item(j)
                    # One worksheet per table, numbered from 1
                    ws = wb.Worksheets.Add(f'Table_{i+1}_{j+1}')
                    for row in range(table.Rows.Count):
                        tableRow = table.Rows.get_Item(row)
                        for cell in range(tableRow.Cells.Count):
                            tableCell = tableRow.Cells.get_Item(cell)
                            # Worksheet coordinates are 1-based
                            ws.SetCellValue(row + 1, cell + 1, self._cellText(tableCell))

            # Save the workbook next to the other outputs
            name = self.wordDoc.split("/")[-1]
            if self.saveFolder is None:
                nameFile = self._DEFAULT_EXCEL_DIR + name + ".xlsx"
            else:
                nameFile = self.saveFolder + '/' + name + ".xlsx"
            wb.SaveToFile(nameFile, FileFormat.Version2016)
            doc.Close()
            wb.Dispose()
            return nameFile
        except Exception:
            return "No table found on word doc"

    def getReference(self):
        """Placeholder: not implemented."""
        pass

    def getSupMaterial(self):
        """Placeholder: not implemented."""
        pass
core/NER/__pycache__/cleanText.cpython-310.pyc ADDED
Binary file (3.44 kB). View file
 
core/NER/__pycache__/cleanText.cpython-311.pyc ADDED
Binary file (5.89 kB). View file
 
core/NER/cleanText.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # reference:
2
+ # https://ayselaydin.medium.com/1-text-preprocessing-techniques-for-nlp-37544483c007
3
+ import re, json
4
+ import nltk
5
+ #nltk.download('stopwords')
6
+ #nltk.download()
7
+ from core.DefaultPackages import openFile, saveFile
8
+ from nltk.corpus import stopwords
9
+ from nltk.corpus.reader.api import wordpunct_tokenize
10
+ from nltk.tokenize import word_tokenize
11
+ #from wordsegment import load, segment
12
+ from wordsegment import load, segment
class cleanGenText():
    """General-purpose text cleaning helpers used before NER / embedding."""

    def __init__(self):
        # Load the wordsegment corpus needed by splitStickWords
        load()

    def removePunct(self, text, KeepPeriod=False):
        """Strip every character that is not a word char or whitespace.

        With KeepPeriod set, "." is preserved as well.
        """
        punctuation = r'[^\w\s\.]' if KeepPeriod else r'[^\w\s]'
        return re.sub(punctuation, '', text)

    def removeURL(self, text):
        """Delete http(s):// and www. URLs from *text*."""
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)

    def removeHTMLTag(self, text):
        """Delete anything that looks like an HTML tag (<...>)."""
        html_tags_pattern = r'<.*?>'
        return re.sub(html_tags_pattern, '', text)

    def removeTabWhiteSpaceNewLine(self, text):
        """Remove all newlines and tabs, then trim surrounding whitespace."""
        # Removing "\n" also covers "\n\n", so one replace is enough
        cleanText = text.replace("\n", "")
        cleanText = cleanText.replace("\t", "")
        return cleanText.strip()

    def removeExtraSpaceBetweenWords(self, text):
        """Collapse each run of whitespace into a single space and trim."""
        return re.sub(r'\s+', ' ', text).strip()

    def removeStopWords(self, text):
        """Tokenise *text* and return the tokens that are not English stop words."""
        stopWords = set(stopwords.words('english'))
        return [word for word in word_tokenize(text) if word.lower() not in stopWords]

    def removeLowercaseBetweenUppercase(self, segment):
        """Return the capitalised words of *segment*, merging adjacent ones.

        A lowercase word between capitalised words splits them:
            "Myanmar (formerly Burma)" -> ["Myanmar", "Burma"]
        Capitalised words in a row stay together:
            "Viet Nam" -> ["Viet Nam"]
        """
        outputUp = []
        segment = self.removeTabWhiteSpaceNewLine(segment)
        segments = segment.split(" ")
        for w, word in enumerate(segments):
            cleanWord = self.removeTabWhiteSpaceNewLine(self.removePunct(word))
            # Empty or punctuation-only tokens (e.g. from double spaces)
            # previously raised IndexError on cleanWord[0]
            if not cleanWord:
                continue
            prevWord = segments[w - 1] if w > 0 else ""
            if cleanWord[0].isupper():
                # The raw previous token decides, so "(Burma" does not chain
                if len(prevWord) > 0 and prevWord[0].isupper():
                    outputUp[-1] += " " + cleanWord
                else:
                    outputUp.append(cleanWord)
        return outputUp

    def textPreprocessing(self, text, keepPeriod=False):
        """Clean *text* and return (cleaned text, stop-word-filtered tokens)."""
        # NOTE(review): punctuation is stripped first, so the URL and HTML
        # patterns below can no longer match "://" or "<...>". The order is
        # kept because removeDOI appears to expect punctuation-stripped DOI
        # text -- confirm before reordering.
        cleanText = self.removePunct(text, KeepPeriod=keepPeriod)
        cleanText = self.removeURL(cleanText)
        cleanText = self.removeHTMLTag(cleanText)
        # remove \n or \t and unnecessary white space
        cleanText = self.removeTabWhiteSpaceNewLine(cleanText)
        # stop-words removal
        filteredWord = self.removeStopWords(cleanText)
        return cleanText, filteredWord

    def splitStickWords(self, word):
        """Split run-together words with wordsegment and re-join with spaces."""
        split_words = segment(word)
        return " ".join(split_words)

    def removeDOI(self, word, doiLink=None):
        """Blank out DOI residue in *word*.

        A word containing "DOI" (e.g. "1368598DOI") becomes "". When
        *doiLink* is given, its punctuation-stripped form is removed from
        the word-segmented version of *word*
        (e.g. "10.1007s004390161742yORIGINAL").
        """
        if "DOI" in word:
            word = ""
        if doiLink is not None:
            w = self.splitStickWords(word)
            cleanDOI = self.removePunct(doiLink)
            if cleanDOI in w:
                word = w.replace(cleanDOI, "")
        return word
core/NER/html/__pycache__/extractHTML.cpython-310.pyc ADDED
Binary file (6.3 kB). View file
 
core/NER/html/__pycache__/extractHTML.cpython-311.pyc ADDED
Binary file (13.1 kB). View file
 
core/NER/html/extractHTML.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
2
+ from bs4 import BeautifulSoup
3
+ import requests
4
+ from core.DefaultPackages import openFile, saveFile
5
+ from core.NER import cleanText
6
+ import pandas as pd
7
class HTML():
    """Load an article page (from a URL or a local file) and extract content.

    ``htmlLink`` is tried first; when it is empty or the string ``"None"``
    the local ``htmlFile`` is parsed instead. Every loader failure degrades
    to an empty ``BeautifulSoup`` document rather than raising.
    """

    # Links that are never downloaded. The comparison is an exact match on
    # the whole link, not a prefix/domain match.
    _SKIPPED_LINKS = (
        'https://broadinstitute.github.io/picard/',
        'https://software.broadinstitute.org/gatk/best-practices/',
        'https://www.ncbi.nlm.nih.gov/genbank/',
        'https://www.mitomap.org/',
    )

    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile

    @staticmethod
    def _emptySoup():
        """Return an empty document, used as the uniform failure value."""
        return BeautifulSoup("", 'html.parser')

    @staticmethod
    def _parseErrorTypes():
        """Return the exception types reported as an HTML parse error.

        The lxml names are imported here because names bound in a class body
        are not visible from inside its methods; referencing them bare in an
        ``except`` clause would raise ``NameError`` instead of handling the
        failure. lxml is optional: without it only ``OSError`` is used.
        """
        try:
            from lxml.etree import ParserError, XMLSyntaxError
        except ImportError:
            return (OSError,)
        return (ParserError, XMLSyntaxError, OSError)

    def _fetchSoup(self):
        """Download ``self.htmlLink`` with browser-like headers and parse it."""
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/114.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": self.htmlLink,
            "Connection": "keep-alive"
        }
        # The context manager releases the pooled connections afterwards.
        with requests.Session() as session:
            session.headers.update(headers)
            r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
        if r.status_code != 200 or not r.text.strip():
            print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
            return self._emptySoup()
        return BeautifulSoup(r.content, 'html.parser')

    def openHTMLFile(self):
        """Return the parsed page as ``BeautifulSoup`` (empty on any failure)."""
        if self.htmlLink in self._SKIPPED_LINKS:
            return self._emptySoup()
        try:
            if self.htmlLink and self.htmlLink != "None":
                return self._fetchSoup()
            with open(self.htmlFile, encoding='utf-8') as fp:
                return BeautifulSoup(fp, 'html.parser')
        except self._parseErrorTypes() as e:
            print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
            return self._emptySoup()
        except Exception as e:
            print(f"❌ General exception for {self.htmlLink}: {e}")
            return self._emptySoup()

    def _textFromSoup(self, soup):
        """Return the cleaned text of the last ``<html>`` element, or ``""``."""
        roots = soup.find_all("html")
        if not roots:
            return ""
        cl = cleanText.cleanGenText()
        return cl.removeExtraSpaceBetweenWords(roots[-1].get_text())

    def getText(self):
        """Return the whole page text with extra spaces between words removed."""
        return self._textFromSoup(self.openHTMLFile())

    def _sectionsFromHeadings(self, soup):
        """Map each ``<h2>`` title to the list of paragraph texts under it.

        For every heading but the last, paragraphs are collected until one
        whose text equals the first paragraph after the next heading. The
        last heading takes every following ``<p>`` that has a string.
        """
        headings = soup.find_all('h2')
        sections = {}
        for pos, heading in enumerate(headings):
            title = heading.text
            if title not in sections:
                sections[title] = []
            if pos + 1 < len(headings):
                stop = headings[pos + 1].find_next("p")
                para = heading.find_next("p")
                # Stop at the end of the document as well, so a heading
                # without a following paragraph cannot dereference None.
                while para is not None and (stop is None or para.text != stop.text):
                    sections[title].append(para.text)
                    para = para.find_next("p")
            else:
                sections[title] = [p.text for p in heading.find_all_next("p", string=True)]
        return sections

    def _addScienceDirectSections(self, sections):
        """Fill ``sections`` in place from the Elsevier article API.

        The API key is read from the ``SCIENCE_DIRECT_API`` environment
        variable. A missing key or link, or a failed request, leaves
        ``sections`` untouched so the caller can fall back to the page text.
        """
        import os  # the file has no module-level ``import os``

        api_key = os.environ.get("SCIENCE_DIRECT_API")
        if not api_key or not self.htmlLink:
            return
        # ScienceDirect article DOI, e.g. "10.1016/j.ajhg.2011.01.009"
        doi = self.htmlLink.split("https://doi.org/")[-1]
        base_url = "https://api.elsevier.com/content/article/doi/"
        headers = {
            "Accept": "application/json",
            "X-ELS-APIKey": api_key
        }
        try:
            response = requests.get(base_url + doi, headers=headers, timeout=15)
            if response.status_code != 200:
                return
            supp_data = response.json()["full-text-retrieval-response"]
        except (requests.RequestException, ValueError, KeyError) as e:
            print(f"ScienceDirect request failed for {doi}: {type(e).__name__}")
            return
        if not isinstance(supp_data, dict):
            return
        if "originalText" in supp_data:
            original = supp_data["originalText"]
            if isinstance(original, str):
                sections["originalText"] = [original]
            if isinstance(original, dict):
                sections["originalText"] = [original[key] for key in original]
        else:
            for key in supp_data:
                sections[key] = [supp_data[key]]

    def getListSection(self, scienceDirect=None):
        """Return the richer of the section-based text and the raw page text.

        Sections come from the ``<h2>`` structure of the page when
        ``scienceDirect`` is None; the Elsevier API is consulted when
        ``scienceDirect`` is given or no heading was found. The merged
        section text is compared with the whole-page text and the longer
        one is returned. Any failure yields ``""``.
        """
        try:
            # One fetch serves both the section walk and the full-text fallback.
            soup = self.openHTMLFile()
            sections = {}
            if scienceDirect is None:
                sections = self._sectionsFromHeadings(soup)
            if scienceDirect is not None or len(sections) == 0:
                self._addScienceDirectSections(sections)
            textJson = self.mergeTextInJson(sections)
            textHTML = self._textFromSoup(soup)
            return textHTML if len(textHTML) > len(textJson) else textJson
        except Exception as e:
            print("failed all")
            print(f"getListSection error: {type(e).__name__}: {e}")
            return ""

    def getReference(self):
        """Return the distinct, cleaned entries of the "References" section.

        ``getListSection`` returns plain text, so the section dictionary is
        built directly from the page headings here. An absent "References"
        heading gives an empty list.
        """
        ref = []
        cl = cleanText.cleanGenText()
        sections = self._sectionsFromHeadings(self.openHTMLFile())
        for entry in sections.get("References", []):
            cleaned, _ = cl.textPreprocessing(entry)
            if cleaned not in ref:
                ref.append(cleaned)
        return ref

    def getSupMaterial(self):
        """Map supplementary-looking ``<h2>`` titles to the links below them.

        A heading qualifies when its lower-cased text contains
        "supplementary", "material", "additional" or "support". Links are
        taken up to (not including) the first link after the next heading,
        and only links containing "https" are kept.
        """
        keywords = ("supplementary", "material", "additional", "support")
        materials = {}
        headings = self.openHTMLFile().find_all('h2')
        for pos, heading in enumerate(headings):
            title = heading.text
            lowered = title.lower()
            if not any(word in lowered for word in keywords):
                continue
            links = [a["href"] for a in heading.find_all_next("a", href=True)]
            if pos + 1 < len(headings):
                nextAnchor = headings[pos + 1].find_next("a", href=True)
                if nextAnchor is not None and nextAnchor["href"] in links:
                    links = links[:links.index(nextAnchor["href"])]
            materials.setdefault(title, []).extend(l for l in links if "https" in l)
        return materials

    def extractTable(self):
        """Return the page tables as a list of DataFrames (``[]`` if none)."""
        from io import StringIO

        soup = self.openHTMLFile()
        if len(soup) == 0:
            return []
        try:
            # read_html expects a path or file-like object; passing literal
            # HTML as a plain string is deprecated in recent pandas.
            return pd.read_html(StringIO(str(soup)))
        except ValueError:
            print("No tables found in HTML file")
            return []

    def mergeTextInJson(self, jsonHTML):
        """Flatten ``{section: [paragraph, ...]}`` into one text.

        Each non-empty paragraph is cleaned with ``textPreprocessing``
        (periods kept) and written back into ``jsonHTML``. Paragraphs of a
        section are terminated with "." and each non-empty section is
        closed with a blank line.
        """
        cl = cleanText.cleanGenText()
        parts = []
        for sec in jsonHTML:
            paragraphs = jsonHTML[sec]
            if len(paragraphs) == 0:
                continue
            for i in range(len(paragraphs)):
                text = paragraphs[i]
                if len(text) > 0:
                    text, _ = cl.textPreprocessing(text, keepPeriod=True)
                paragraphs[i] = text
                # NOTE(review): the separator is decided from the stored
                # previous paragraph, which lacks the "." appended below, so
                # consecutive paragraphs can be joined by ".. " - kept as is.
                if i >= 1 and len(paragraphs[i - 1]) > 0 and paragraphs[i - 1][-1] != ".":
                    parts.append(". ")
                parts.append(text)
                if len(text) > 0 and text[-1] != ".":
                    parts.append(".")
            # sections are separated by "\n\n"
            parts.append("\n\n")
        return "".join(parts)

    def removeHeaders(self):
        """Placeholder; not implemented."""
        pass

    def removeFooters(self):
        """Placeholder; not implemented."""
        pass

    def removeReferences(self):
        """Placeholder; not implemented."""
        pass
core/NER/word2Vec/__pycache__/word2vec.cpython-310.pyc ADDED
Binary file (9.44 kB). View file
 
core/NER/word2Vec/__pycache__/word2vec.cpython-311.pyc ADDED
Binary file (19.2 kB). View file
 
core/NER/word2Vec/heuristic.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from datetime import datetime
3
+
4
class HeuristicManager:
    """Decide, from simple heuristics, whether the model should be retrained.

    ``model`` must expose ``model.wv.most_similar(term)`` returning a list
    of ``(word, similarity)`` pairs and raising ``KeyError`` for unknown
    terms.
    """

    def __init__(self, model, log_file="heuristic_log.txt", min_similarity_threshold=0.5, min_new_data_len=50):
        self.model = model
        self.min_similarity_threshold = min_similarity_threshold
        self.min_new_data_len = min_new_data_len
        self.log_file = log_file
        # Configures the root logger; basicConfig does nothing if the root
        # logger already has handlers.
        logging.basicConfig(filename=self.log_file, level=logging.INFO)

    def log(self, message):
        """Write a timestamped message to the log and to stdout."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        line = f"[{timestamp}] {message}"
        logging.info(line)
        print(line)

    def check_similarity(self, test_terms):
        """Return one trigger message per term the model handles poorly.

        A term triggers when its best neighbour scores below
        ``min_similarity_threshold``, when it has no neighbour at all, or
        when it is missing from the vocabulary.
        """
        triggers = []
        for term in test_terms:
            try:
                neighbours = self.model.wv.most_similar(term)
            except KeyError:
                triggers.append(f"'{term}' not in vocabulary")
                continue
            if not neighbours:
                # An empty neighbour list would otherwise raise IndexError.
                triggers.append(f"No similar terms found for '{term}'")
                continue
            sim = neighbours[0][1]
            if sim < self.min_similarity_threshold:
                triggers.append(f"Low similarity for '{term}': {sim}")
        return triggers

    def check_metadata(self, metadata):
        """Return a trigger when the metadata text mentions a watched keyword."""
        haystack = str(metadata).lower()
        if any(keyword in haystack for keyword in ("haplogroup b", "eastasia", "asian")):
            return ["Detected new haplogroup or regional bias: 'Asian' or 'B'"]
        return []

    def check_new_data_volume(self, new_data):
        """Return a message when ``new_data`` is shorter than ``min_new_data_len``."""
        if len(new_data) < self.min_new_data_len:
            return ["Not enough new data to justify retraining"]
        return []

    def should_retrain(self, test_terms, new_data, metadata):
        """Return True when a trigger fired and there is enough new data.

        Too little new data is a reason *not* to retrain, so it vetoes the
        similarity/metadata triggers instead of counting as one of them.
        """
        triggers = self.check_similarity(test_terms) + self.check_metadata(metadata)
        blockers = self.check_new_data_volume(new_data)

        if triggers and not blockers:
            self.log("Retraining triggered due to:")
            for trigger in triggers:
                self.log(f" - {trigger}")
            return True

        self.log("No retraining needed.")
        for blocker in blockers:
            self.log(f" - {blocker}")
        return False
core/NER/word2Vec/testModel/test_model.model ADDED
Binary file (25.2 kB). View file
 
core/NER/word2Vec/testModel/test_model.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 24 100
2
+ dna -0.0005385255 0.0002430238 0.005111818 0.009016951 -0.009293036 -0.007109866 0.0064572324 0.008987154 -0.0050192317 -0.0037659889 0.0073785 -0.0015431087 -0.0045221853 0.006557529 -0.004854595 -0.0018278129 0.002881375 0.0010002495 -0.00829578 -0.009462763 0.007312361 0.0050688535 0.0067577288 0.0007685764 0.006347226 -0.003397316 -0.0009421973 0.0057741464 -0.007532499 -0.0039303782 -0.0075064874 -0.0009439946 0.009533595 -0.0073319245 -0.002333888 -0.0019326513 0.0080786925 -0.005930193 3.549824e-05 -0.00475331 -0.0095964745 0.005000012 -0.008770563 -0.0043735923 -2.9246534e-05 -0.00030931013 -0.007669701 0.009599569 0.004982613 0.009233704 -0.008148657 0.004488859 -0.0041414667 0.00081141765 0.008487031 -0.00446156 0.0045125154 -0.006793622 -0.0035560841 0.009394251 -0.0015774865 0.00032431752 -0.004129968 -0.0076763057 -0.0015165819 0.0024841889 -0.00088440755 0.0055526863 -0.0027446826 0.002259023 0.0054701897 0.008356409 -0.0014508999 -0.009201209 0.004375452 0.00058271736 0.0074576377 -0.00080706284 -0.0026372937 -0.008752899 -0.00087625836 0.00282087 0.005398569 0.0070530027 -0.0057170955 0.0018605916 0.006099475 -0.0048024287 -0.003104349 0.0067992285 0.0016360026 0.00019302641 0.00348545 0.00021818833 0.009630539 0.0050670514 -0.008908632 -0.007042295 0.0009007676 0.0063867364
3
+ from -0.00861988 0.0036778022 0.005193427 0.005744547 0.0074751326 -0.0061739217 0.0011082628 0.0060625207 -0.0028567386 -0.006184132 -0.00041290926 -0.008384168 -0.0055893976 0.007104685 0.003362318 0.007228353 0.0068033817 0.007533677 -0.003792071 -0.000581891 0.0023577819 -0.0045196284 0.008395244 -0.009858517 0.006761404 0.0029261683 -0.004930935 0.0043925527 -0.0017370671 0.006713542 0.009974645 -0.0043735756 -0.0006050642 -0.005716478 0.003858548 0.002799571 0.00690247 0.00610934 0.009526547 0.009269763 0.007910428 -0.007008808 -0.00916451 -0.00033672128 -0.0030898354 0.007890073 0.005923819 -0.001552973 0.001516021 0.0017856265 0.007822941 -0.009514211 -0.00020886083 0.0034666678 -0.00094713847 0.008384139 0.009009283 0.0065234327 -0.0007208324 0.007705209 -0.00853289 0.0032079336 -0.004625999 -0.0050743804 0.0035901158 0.005388813 0.007766254 -0.005744939 0.0074327383 0.006626378 -0.003704473 -0.008735958 0.005445474 0.0065230317 -0.000784768 -0.006700798 -0.007075852 -0.002488528 0.0051543443 -0.0036620772 -0.00938257 0.003815971 0.004890136 -0.0064404616 0.0012033634 -0.0020763231 2.994902e-05 -0.0098790005 0.002700701 -0.004756241 0.0011076172 -0.0015674155 0.0022046466 -0.00787344 -0.0027070795 0.002668326 0.0053478787 -0.002396734 -0.009512201 0.0045024394
4
+ mtdna 8.645293e-05 0.003076037 -0.006815487 -0.0013743688 0.0076927417 0.0073529496 -0.0036715195 0.0026677884 -0.008309281 0.00619759 -0.00463892 -0.0031715294 0.009313415 0.00088058383 0.0074962615 -0.00608139 0.005167896 0.009930803 -0.008471472 -0.0051321597 -0.007057574 -0.0048644566 -0.003772668 -0.008518714 0.0079532955 -0.0048361127 0.008438283 0.005270068 -0.0065578814 0.0039592343 0.005482614 -0.007444929 -0.0074228924 -0.002492343 -0.008628872 -0.0015748737 -0.00038757667 0.0032959366 0.0014325404 -0.00088083016 -0.005591098 0.0017297626 -0.00089552783 0.0068030986 0.0039881677 0.004533183 0.0014284542 -0.0027126821 -0.0043595196 -0.0010315293 0.0014437438 -0.0026617546 -0.0070882514 -0.007825746 -0.009136036 -0.005931676 -0.001850123 -0.004323682 -0.0064626597 -0.0037265678 0.004296681 -0.0037233941 0.008404572 0.001539496 -0.007246572 0.009443451 0.007636867 0.0055208146 -0.0068550883 0.0058190743 0.004034045 0.005188155 0.0042629624 0.0019477821 -0.003167882 0.008342064 0.009619138 0.0038047181 -0.0028461283 5.6938893e-07 0.0012001555 -0.0084682545 -0.008234347 -0.00023238244 0.0012304098 -0.005750644 -0.0047139754 -0.0073490315 0.008316314 0.00010242269 -0.004513882 0.005704978 0.009199796 -0.004097329 0.007985275 0.005386452 0.0058861696 0.0005043713 0.008208188 -0.0070221694
5
+ in -0.008226077 0.009303831 -0.00018710589 -0.0019704443 0.0046143015 -0.004104392 0.0027394402 0.006979235 0.0060486975 -0.0075411424 0.00939576 0.00465202 0.004012172 -0.006245291 0.008499353 -0.002164537 0.008836197 -0.005347778 -0.008136817 0.006804632 0.0016640095 -0.0022142953 0.009522269 0.009494823 -0.0097868545 0.0025105644 0.0061560757 0.0038842657 0.0020310257 0.00043876152 0.00068163266 -0.0038464246 -0.007141551 -0.0020813115 0.003930752 0.008838634 0.009274302 -0.0059668766 -0.009419525 0.009759848 0.0034291998 0.005158939 0.006265811 -0.0027623416 0.007310359 0.0027998323 0.0028576967 -0.0023982434 -0.003139742 -0.0023701421 0.0042809984 4.8589092e-05 -0.009614385 -0.00968607 -0.006160773 -0.00011437661 0.0019819876 0.009428 0.0056011924 -0.004298171 0.00026028603 0.004974084 0.007744428 -0.001135339 0.004278759 -0.0057750097 -0.0008068469 0.00811882 -0.002369315 -0.009674972 0.0058119837 -0.0039038642 -0.001220125 0.010017389 -0.002241946 -0.0047185957 -0.0053141676 0.0069846674 -0.005741993 0.002120917 -0.0052751247 0.00613608 0.0043662013 0.0026298608 -0.0015129133 -0.002735619 0.008999614 0.0052172863 -0.0021470466 -0.009465257 -0.007413552 -0.0010587372 -0.00078251073 -0.0025414668 0.009710779 -0.00044944565 0.005915 -0.007467981 -0.0024928953 -0.005583053
6
+ european -0.007147033 0.0012623417 -0.007189088 -0.0022513974 0.0037773554 0.005857864 0.0012027922 0.0021598793 -0.004109796 0.007198152 -0.006319537 0.0046250015 -0.008186181 0.0020334523 -0.0049318667 -0.0042960607 -0.0030848773 0.0056965156 0.0057683894 -0.004991361 0.00076802005 -0.008515792 0.0078122346 0.009295911 -0.002746969 0.0008081935 0.0007694419 0.00550255 -0.008630911 0.0006062931 0.0068933573 0.0021813295 0.0010798875 -0.009366349 0.008471645 -0.006258249 -0.0029761735 0.0035168754 -0.00078163494 0.0014152499 0.0017921324 -0.006839617 -0.009737293 0.009092817 0.0062128166 -0.00694695 0.0033956417 0.00017217748 0.004755041 -0.0071203653 0.004067516 0.004303939 0.009927 -0.0045391554 -0.0014395243 -0.0073114103 -0.009704934 -0.009090646 -0.0010375449 -0.0065315044 0.0048550633 -0.006148244 0.0026037877 0.000752482 -0.0034296552 -0.00092229253 0.010017935 0.009206015 -0.004494388 0.009070265 -0.0055859834 0.0059493524 -0.0030818144 0.0034673577 0.003029479 0.0069394265 -0.0023470228 0.008820008 0.0075530927 -0.009551933 -0.008064042 -0.007652859 0.0029148757 -0.0027951996 -0.00694831 -0.008136711 0.008356287 0.0019903474 -0.00933717 -0.004817203 0.0031394493 -0.0046995636 0.005327329 -0.0042287502 0.0027155946 -0.008033582 0.0062630265 0.0047997306 0.00079031993 0.0029888113
7
+ common -0.008722234 0.0021272295 -0.0008539916 -0.009321866 -0.0094246445 -0.001412531 0.0044288053 0.00372704 -0.006505282 -0.006894708 -0.0049991854 -0.0023061878 -0.007229156 -0.009607243 -0.0027377736 -0.008360431 -0.0060269493 -0.005675304 -0.00234906 -0.0017278373 -0.008954683 -0.000731004 0.008155364 0.007693106 -0.007208155 -0.003644954 0.0031189725 -0.009568674 0.0014795078 0.0065395026 0.0057490384 -0.008770905 -0.0045228535 -0.008156553 4.5400484e-05 0.00927559 0.005980464 0.0050585535 0.0050439127 -0.0032448657 0.009562716 -0.0073605715 -0.0072781076 -0.002255642 -0.00077679846 -0.0032283778 -0.00060498127 0.007476424 -0.00070291053 -0.0016193221 0.002749461 -0.008367007 0.0078366995 0.008528508 -0.009591924 0.0024459555 0.009891981 -0.007673955 -0.006969234 -0.0077365288 0.008389148 -0.00067644875 0.009162579 -0.008137346 0.0037369097 0.0026538277 0.0007320811 0.002340243 -0.007473436 -0.009367513 0.0023810826 0.0061679846 0.007993824 0.005740968 -0.00078188477 0.008307063 -0.009312772 0.0033975116 0.00027130058 0.003872196 0.007375048 -0.0067289495 0.005584901 -0.0095183 -0.0008194822 -0.008691651 -0.0050952802 0.009296191 -0.0018460032 0.0029113942 0.009088126 0.008946764 -0.008196811 -0.0030016953 0.009896215 0.005113277 -0.0015862831 -0.008699891 0.0029696936 -0.0066840183
8
+ sequence 0.008134779 -0.0044588344 -0.0010699655 0.001010431 -0.00018677961 0.0011458534 0.0061133304 -1.2402037e-05 -0.0032534893 -0.0015101052 0.0058955555 0.0015073137 -0.0007181427 0.009341042 -0.004917502 -0.0008413052 0.009177319 0.0067567485 0.0015022643 -0.0088886535 0.0011522508 -0.0022903979 0.009365224 0.0012041465 0.0014943897 0.0024040388 -0.0018358674 -0.004996856 0.00023002276 -0.0020175653 0.0066060103 0.008935089 -0.0006746635 0.0029776676 -0.0061099143 0.0017025766 -0.006924371 -0.008690522 -0.005899618 -0.008961226 0.0072769034 -0.005776607 0.00827455 -0.007233702 0.003422895 0.009676102 -0.0077943387 -0.009949275 -0.0043248134 -0.0026828882 -0.0002740396 -0.008833413 -0.008620106 0.0027985822 -0.008205106 -0.009067738 -0.0023404285 -0.00863584 -0.007056119 -0.008398832 -0.0003011976 -0.0045611723 0.006630901 0.0015288803 -0.0033471577 0.006116343 -0.0060124504 -0.004648673 -0.0072044823 -0.0043340866 -0.0018032556 0.00649206 -0.0027680297 0.004921421 0.006912646 -0.007459126 0.004573438 0.006129695 -0.002956148 0.0066218316 0.006121442 -0.0064460207 -0.0067676785 0.002543585 -0.0016248615 -0.006062931 0.009498339 -0.005135456 -0.006549685 -0.000118091535 -0.002699267 0.00044816377 -0.0035289875 -0.00041692218 -0.00070437486 0.00083035015 0.0081978375 -0.005737508 -0.0016556873 0.005569238
9
+ bru18 0.008155276 -0.0044185193 0.008987652 0.008259665 -0.0044238693 0.00031090993 0.004277394 -0.0039252234 -0.0055654007 -0.006509729 -0.0006656875 -0.00030213682 0.004489389 -0.0024855223 -0.00015437756 0.0024471143 0.0048732683 -2.8606542e-05 -0.0063628056 -0.009279111 1.8654398e-05 0.006667726 0.0014650559 -0.0089674555 -0.007945727 0.006548857 -0.0037690091 0.006254232 -0.0067004655 0.008482541 -0.0065189763 0.0032740948 -0.001067833 -0.0067885593 -0.0032949874 -0.0011434925 -0.005471747 -0.001204045 -0.0075744605 0.0026601462 0.009080238 -0.0023750134 -0.0009867329 0.0035252234 0.008680149 -0.0059299506 -0.006889695 -0.002942458 0.00913801 0.0008666254 -0.008663911 -0.001442217 0.009477263 -0.0075691855 -0.0053729587 0.009308613 -0.008970956 0.0038234547 0.00065334333 0.0066515543 0.008311967 -0.002862157 -0.003982641 0.008891435 0.0020839446 0.0062542376 -0.009450494 0.0095988605 -0.0013514485 -0.006062315 0.0029950105 -0.0004512243 0.0047055846 -0.0022705523 -0.004145877 0.0022992992 0.008370594 -0.004990823 0.0026696166 -0.00798221 -0.0067810714 -0.000469271 -0.008768882 0.0027844147 0.0015907697 -0.0023179457 0.005011737 0.009743466 0.008472866 -0.001870301 0.0020416898 -0.0039901678 -0.008234559 0.0062697986 -0.0019247098 -0.00066059735 -0.0017619281 -0.004536765 0.004069 -0.0042896206
10
+ bru50 -0.009579504 0.008948466 0.0041579367 0.00923892 0.006649052 0.0029269105 0.009801864 -0.0044190143 -0.0068119396 0.004226486 0.0037328962 -0.005664456 0.009715384 -0.0035591167 0.009558758 0.00083636935 -0.006334789 -0.0019748765 -0.007390546 -0.002990235 0.0010405012 0.009480547 0.009361016 -0.0065955063 0.0034724285 0.0022746115 -0.0024764987 -0.009228658 0.0010185506 -0.008164371 0.0063289437 -0.0058100903 0.005530614 0.009826734 -0.00015984276 0.0045368825 -0.0018012718 0.0073676347 0.0039300686 -0.0090082595 -0.0023973046 0.0036249864 -0.00010732573 -0.0011888575 -0.0010430571 -0.0016724848 0.00059902505 0.0041630277 -0.004250072 -0.0038341933 -5.2427928e-05 0.00026678806 -0.00017553278 -0.0047934647 0.0043008197 -0.002173452 0.0020970574 0.00065915886 0.005959963 -0.0068526124 -0.00680708 -0.004473089 0.009448878 -0.001590459 -0.009438289 -0.000534792 -0.0044530216 0.0060103727 -0.009585406 0.002857136 -0.009246552 0.001258808 0.0059965253 0.0074065947 -0.007623657 -0.0060443347 -0.006831209 -0.007910946 -0.009496376 -0.0021281417 -0.0008362788 -0.007265241 0.0067816544 0.0011141741 0.0058228294 0.0014675015 0.00078702695 -0.007366497 -0.0021715113 0.0043177926 -0.005089294 0.001137756 0.0028883398 -0.0015285894 0.009943532 0.008348668 0.0024183327 0.007110643 0.005890512 -0.005592114
11
+ vietnam -0.005153963 -0.0066644135 -0.007776157 0.0083126435 -0.0019782323 -0.006856599 -0.004155673 0.0051580225 -0.0028790692 -0.0037560624 0.0016262402 -0.00278304 -0.001570952 0.0010760438 -0.002967586 0.008515032 0.003917556 -0.009953211 0.0062494674 -0.0067655 0.00076895714 0.0043992978 -0.005096968 -0.0021128112 0.00809259 -0.0042428537 -0.0076304777 0.009258844 -0.0021577128 -0.004717085 0.008580298 0.004269408 0.004324098 0.009280228 -0.008452614 0.0052631963 0.0020472223 0.004193831 0.0016919046 0.004460046 0.0044873925 0.0060984488 -0.0032084621 -0.0045590503 -0.0004232687 0.002529075 -0.0032731881 0.006051339 0.0041546253 0.00776509 0.002568826 0.008108382 -0.0013972289 0.008070817 0.003707151 -0.008045609 -0.00393531 -0.0024772724 0.004889826 -0.00087688275 -0.00282919 0.007839672 0.009338199 -0.0016121961 -0.0051723607 -0.0046861414 -0.0048465827 -0.0095901145 0.0013706182 -0.0042283125 0.002539541 0.0056244545 -0.00406352 -0.009583576 0.0015531465 -0.006689678 0.0025049727 -0.0037749638 0.007073151 0.00063951715 0.0035553342 -0.0027433916 -0.001711565 0.007655947 0.0014000075 -0.005851 -0.007834303 0.0012315387 0.006458937 0.0055561876 -0.00897213 0.008598417 0.0040550055 0.007476387 0.00975736 -0.007282407 -0.009030263 0.0058277464 0.009392481 0.0034955258
12
+ sample 0.007100903 -0.0015709094 0.007947078 -0.00948947 -0.00802812 -0.006650821 -0.004002562 0.00500194 -0.0038224515 -0.008330948 0.00841617 -0.0037529538 0.008619977 -0.004892141 0.003931126 0.004920354 0.0023956115 -0.0028135795 0.0028564015 -0.008257614 -0.0027645228 -0.0026008752 0.007249391 -0.0034709626 -0.0066022277 0.0043369113 -0.0004823991 -0.0035912786 0.006893536 0.003869671 -0.0038965137 0.0007677057 0.009145668 0.0077625574 0.0063656354 0.004670941 0.0023901698 -0.0018358309 -0.006370667 -0.00030689163 -0.0015674513 -0.00057719386 -0.0062623145 0.0074473424 -0.0066001806 -0.007243944 -0.0027626618 -0.0015170419 -0.007635178 0.0006969715 -0.005330137 -0.0012829994 -0.007370956 0.0019601034 0.003276234 -1.4737604e-05 -0.005451358 -0.001723771 0.00709824 0.003738 -0.008888436 -0.0034084066 0.0023648455 0.0021412992 -0.009477984 0.004583573 -0.008656226 -0.007383396 0.0034825006 -0.0034719554 0.0035707187 0.008896884 -0.003571185 0.009332037 0.0017215977 0.009857596 0.005704204 -0.009146731 -0.0033407472 0.0065290304 0.0055978918 0.008714949 0.0069304765 0.008049887 -0.009821734 0.004303451 -0.0050309277 0.0035138857 0.0060621244 0.0043927776 0.007520648 0.0014953684 -0.0012639741 0.0057787485 -0.0056348047 4.0551466e-05 0.009468461 -0.005486985 0.0038199269 -0.008121091
13
+ collected 0.0097750295 0.008170629 0.0012814446 0.0051154387 0.0014172737 -0.006454876 -0.0014259414 0.0064561926 -0.004619688 -0.0039992593 0.004923175 0.0027045405 -0.0018415204 -0.0028716852 0.006021755 -0.005721393 -0.003250512 -0.0064803455 -0.0042360183 -0.008592084 -0.004467861 -0.008505252 0.0013975133 -0.008609542 -0.009919709 -0.008202052 -0.0067797694 0.006683116 0.0037784956 0.0003495915 -0.002959815 -0.007438984 0.0005348175 0.0005005026 0.00019596443 0.0008583165 0.00078985846 -5.4285138e-05 -0.008013045 -0.005872034 -0.00837931 -0.0013207265 0.0018039295 0.0074345516 -0.001966708 -0.0023440684 0.009481904 7.425008e-05 -0.0023982543 0.008607863 0.0026964454 -0.0053582233 0.0065950346 0.0045082304 -0.0070585674 -0.00031050213 0.00083163293 0.005739447 -0.0017207591 -0.0028131874 0.0017429565 0.00085032795 0.0012085037 -0.002637083 -0.0060016937 0.007339091 0.0075857476 0.00830421 -0.008602928 0.0026385786 -0.0035621128 0.0096288975 0.0029010975 0.004643974 0.0023910597 0.006626162 -0.005746352 0.007899223 -0.0024186398 -0.0045691207 -0.0020768652 0.009735589 -0.0068560173 -0.0021970137 0.006994984 -4.366915e-05 -0.0062879827 -0.006398747 0.008941079 0.0064397687 0.004773856 -0.003261329 -0.009269935 0.0038002136 0.0071752095 -0.0056398017 -0.007860231 -0.0029721109 -0.0049388385 -0.0023143636
14
+ europe -0.0019466967 -0.005264445 0.009446078 -0.009301849 0.00450806 0.005410841 -0.0014122794 0.009008321 0.009883694 -0.0054709506 -0.0060238987 -0.006749262 -0.007891144 -0.0030501 -0.00559189 -0.008350158 0.000785714 0.002999436 0.0064088805 -0.0026336086 -0.0044599404 0.0012484614 0.00038998463 0.008114584 0.00018636887 0.0072303875 -0.008259172 0.008436813 -0.0018950498 0.008705898 -0.007616939 0.0017924334 0.0010528992 4.4615095e-05 -0.005109563 -0.009249746 -0.0072665187 -0.007951877 0.0019136231 0.00048003704 -0.0018163731 0.007123826 -0.0024782037 -0.0013449806 -0.008898934 -0.0099250255 0.008953352 -0.0057566464 -0.006378906 0.0052002883 0.0066733453 -0.0068328637 0.000956345 -0.0060142023 0.0016413335 -0.004295812 -0.0034417375 0.0021831726 0.008657248 0.0067267795 -0.00967649 -0.0056275628 0.007884859 0.0019889344 -0.0042598336 0.0006024022 0.009526292 -0.0011015745 -0.009430234 0.0016114928 0.0062343916 0.00628738 0.0040935944 -0.0056507527 -0.000374705 -4.9610684e-05 0.004579015 -0.0080420235 -0.008019654 0.0002663556 -0.008607854 0.005816331 -0.00042231655 0.00997148 -0.0053460747 -0.00048954826 0.0077552027 -0.004073562 -0.0050113807 0.0015921831 0.0026467363 -0.0025611357 0.006453244 -0.0076659652 0.003398472 0.00049256504 0.008736541 0.0059848153 0.006820848 0.007819741
15
+ ancient -0.00949331 0.009558393 -0.0077741044 -0.0026378995 -0.0048897555 -0.0049655624 -0.008022211 -0.007766241 -0.0045622233 -0.0012816157 -0.0051147 0.0061208857 -0.009519694 -0.005296118 0.009434444 0.0069931676 0.0076746074 0.0042455657 0.0005105317 -0.0060022003 0.006030395 0.002638317 0.007692142 0.0063923756 0.0079497155 0.008663229 -0.009898174 -0.006753931 0.0013303582 0.0064388 0.0073839277 0.0055065546 0.007657052 -0.0051452103 0.006578382 -0.004109781 -0.009049926 0.009156881 0.0013312489 -0.0027684697 -0.0024686211 -0.004237798 0.004802247 0.00442113 -0.0026455545 -0.0073452652 -0.0035828727 -0.00034474322 0.006112652 -0.0028318586 -0.00011603545 0.0008713841 -0.007088451 0.0020616641 -0.0014378024 0.0028043352 0.0048393123 -0.0013679614 -0.0027919079 0.0077378284 0.005049118 0.006718327 0.0045309924 0.00867961 0.0074680797 -0.0010581953 0.008750674 0.0046186065 0.0054406407 -0.0013790869 -0.0020325198 -0.0044157715 -0.008505952 0.0030342783 0.008892043 0.0089222565 -0.0019243953 0.0060931933 0.0037896668 -0.0043041655 0.002026212 -0.005454141 0.008199508 0.005422219 0.003183278 0.0041012214 0.008660769 0.007268954 -0.0008326238 -0.0070764753 0.008396081 0.0072427383 0.0017482204 -0.0013339228 -0.0058783586 -0.004530154 0.008643081 -0.003131084 -0.006341318 0.009878559
16
+ neanderthal 0.007692736 0.009126856 0.001134214 -0.008323363 0.008438394 -0.0036978398 0.005743373 0.0044079996 0.0096743805 -0.009301011 0.009201668 -0.009297726 -0.0068989955 -0.009099583 -0.0055382987 0.0073707746 0.009167804 -0.0033190295 0.0037136457 -0.0036417823 0.007886165 0.0058672884 4.5112392e-06 -0.0036315187 -0.0072244583 0.0047761244 0.0014634884 -0.002615084 0.007832942 -0.004045295 -0.00913638 -0.0022702827 0.00011177889 -0.006659164 -0.0054871286 -0.008484606 0.00924395 0.0074312175 -0.00030530593 0.0073675984 0.0079630045 -0.0007988404 0.0066030715 0.0037836921 0.0050928146 0.0072574555 -0.004751798 -0.0021930316 0.00087973 0.0042327694 0.0033078827 0.0050869007 0.004582786 -0.008444151 -0.0031969673 -0.007233252 0.009679768 0.0049946425 0.0001599608 0.0041068383 -0.0076482734 -0.0062929546 0.003092239 0.006544919 0.0039503933 0.006035828 -0.0019895614 -0.0033235473 0.00020525315 -0.0031931365 -0.005507259 -0.0077802544 0.0065467777 -0.0010795805 -0.0018928167 -0.007799526 0.009349405 0.00087477046 0.0017788016 0.0024914553 -0.0073950374 0.0016234348 0.0029714536 -0.008580277 0.0049522887 0.0024255016 0.0074964412 0.0050449395 -0.0030210917 -0.0071717766 0.007105708 0.0019140064 0.005210298 0.0063858717 0.0019259832 -0.0061174775 -5.528207e-06 0.008260976 -0.0060965912 0.009431074
17
+ modern -0.0071792696 0.0042354544 0.00216289 0.007438057 -0.0048900596 -0.0045788498 -0.0060949842 0.0033097882 -0.004507435 0.008506253 -0.0042799306 -0.009108578 -0.0047961376 0.0064152437 -0.006351414 -0.0052630682 -0.007296127 0.006024725 0.003365447 0.0028487756 -0.0031356772 0.00602019 -0.0061529716 -0.001984372 -0.0059886468 -0.0009987217 -0.0020279228 0.008489572 9.179515e-05 -0.0085772425 -0.0054273363 -0.0068765874 0.0026914866 0.00946441 -0.0058075436 0.008274624 0.008538083 -0.007054826 -0.008883825 0.009470304 0.008378029 -0.0046964334 -0.0067229234 0.007853816 0.003754884 0.008087255 -0.0075793806 -0.009526273 0.0015759452 -0.009809055 -0.004886255 -0.003462314 0.009610498 0.008620381 -0.002831389 0.005837147 0.008235405 -0.002257783 0.009542199 0.0071611865 0.0020309114 -0.0038430467 -0.005072538 -0.00304804 0.007877576 -0.0061799455 -0.0029184332 0.009190523 0.003460949 0.0060627563 -0.008025261 -0.00075433304 0.0055211782 -0.0046972577 0.0074892025 0.009333807 -0.00041072394 -0.0020574103 -0.00060545607 -0.0057792794 -0.0083910655 -0.0014910942 -0.0025447267 0.0043934747 -0.006866489 0.00542165 -0.006739068 -0.0078106844 0.008480591 0.008917766 -0.0034737175 0.0034897032 -0.005797486 -0.008738294 -0.0055089584 0.0067478465 0.0064329007 0.009427363 0.007059985 0.0067415633
18
+ human 0.0013073076 -0.009817197 0.0046000797 -0.00054215814 0.0063516907 0.0017917434 -0.0031376705 0.00779152 0.0015605913 4.5087592e-05 -0.004629277 -0.008477088 -0.0077653346 0.00868444 -0.0089293 0.009021215 -0.009282701 -0.00026340262 -0.0019013402 -0.008945062 0.008634705 0.006775237 0.0030073978 0.00484689 0.000119797296 0.009438227 0.007017406 -0.009846283 -0.0044378787 -0.0012810889 0.0030511408 -0.0043373024 0.0014413317 -0.007862512 0.002772104 0.0047001 0.004937028 -0.0031820575 -0.008430869 -0.009233454 -0.00072350266 -0.007335406 -0.0068239835 0.006137866 0.0071648457 0.0021028868 -0.00790615 -0.0057202103 0.008053211 0.0039317366 -0.0052275606 -0.007412702 0.00076265965 0.0034572822 0.002076003 0.0031028383 -0.0056280685 -0.0099016195 -0.0070258062 0.00023322599 0.0046109683 0.004535595 0.0018992841 0.0051839855 -0.000116945404 0.004136494 -0.009110944 0.0077172276 0.0061438708 0.0051303217 0.0072363587 0.0084579345 0.00074768433 -0.0017087719 0.0005303956 -0.009314834 0.008429295 -0.0063797934 0.008425091 -0.0042409054 0.0006248087 -0.009168093 -0.009569658 -0.007833339 -0.0077458574 0.00037962993 -0.0072201644 -0.004963075 -0.0052754995 -0.004289475 0.0070301695 0.004834569 0.008708495 0.0070971223 -0.0056847483 0.007253502 -0.009290819 -0.0025857396 -0.007757146 0.0042008474
19
+ genome 0.0018013249 0.0070483726 0.002941503 -0.006984167 0.0077269375 -0.005990631 0.008982948 0.0029859466 -0.0040263417 -0.0046959417 -0.004423949 -0.006166649 0.009397486 -0.0026410713 0.00779025 -0.009682492 0.0021134273 -0.001217051 0.007545118 -0.009060286 0.007431912 -0.005112224 -0.006022511 -0.0056468663 -0.0033655176 -0.0034046597 -0.0031906026 -0.007475777 0.0007148267 -0.0005725245 -0.0016790004 0.0037438255 -0.00763313 -0.0032234066 0.00514847 0.00855509 -0.009791086 0.0071872775 0.0052953 -0.003874173 0.008570203 -0.009222292 0.0072385296 0.0053781155 0.0012898272 -0.0051951176 -0.004179599 -0.003369767 0.0015944163 0.001581598 0.007396833 0.0099602975 0.008836587 -0.004008733 0.009636086 -0.00063042255 0.0048575792 0.0025363516 -0.0006256454 0.0036644523 -0.005330011 -0.0057551167 -0.007577021 0.0019176035 0.006513916 0.00090115983 0.0012633507 0.0031810037 0.008123854 -0.007687061 0.0022752027 -0.007455608 0.003715618 0.009514587 0.0075186947 0.006441567 0.008026117 0.006552105 0.0068467325 0.00869257 -0.0049556913 0.009209661 0.0050575286 -0.0021248695 0.008474546 0.005080482 0.009641399 0.0028190457 0.009884555 0.001195692 0.009130684 0.0035973836 0.006580412 -0.00361116 0.0068057566 0.007250423 -0.002115621 -0.0018615718 0.003625693 -0.0070385
20
+ shows 0.009741375 -0.009785563 -0.006502033 0.0027767855 0.0064354893 -0.005370729 0.0027519849 0.009131747 -0.006819064 -0.0061066505 -0.0049928115 -0.00368126 0.0018522884 0.009683641 0.00644354 0.00039165124 0.0024744181 0.00844649 0.009138178 0.005629969 0.005943013 -0.007629522 -0.0038295696 -0.005683565 0.0061836103 -0.00225932 -0.008786562 0.0076284255 0.008406309 -0.0033179314 0.009119112 -0.00073907804 -0.0036286868 -0.0003802314 0.00019241076 -0.0035078088 0.0028134247 0.005731432 0.006873956 -0.008905951 -0.0021951643 -0.0054816343 0.0075234827 0.0065075015 -0.0043688817 0.002324414 -0.0059516523 0.00023538349 0.00945961 -0.0026105444 -0.0051873005 -0.0074033006 -0.0029152564 -0.0008664178 0.0035291065 0.009743326 -0.0033921245 0.001903681 0.009692432 0.0015337794 0.0009810732 0.009802843 0.00930645 0.007710903 -0.006179333 0.009991138 0.005857104 0.009073708 -0.002001237 0.0033512171 0.0068392376 -0.0038913293 0.006648019 0.0025668114 0.009319553 -0.0030298685 -0.0031094935 0.0062168743 -0.00908894 -0.0072543155 -0.006503641 -0.00074380165 -0.002362113 0.0068256087 0.009239293 -0.00091146474 0.0014132133 0.002020571 -0.0020174456 -0.008035576 0.007445874 -0.004299319 0.004580612 0.009090945 0.0030486963 0.00313993 0.0040727276 -0.0027017219 0.0038345656 0.00033530922
21
+ variation 0.005626712 0.005497371 0.0018291199 0.0057494068 -0.008968078 0.0065593575 0.009225992 -0.0042071473 0.0016075504 -0.0052338815 0.0010582185 0.0027701687 0.008160736 0.00054401276 0.0025570584 0.001297735 0.008402523 -0.0057077026 -0.00626183 -0.0036275184 -0.0023005498 0.005041063 -0.008120357 -0.0028335357 -0.008197427 0.00514971 -0.0025680638 -0.009067107 0.0040717293 0.009017323 -0.0030376601 -0.0058385395 0.0030198884 -0.00043584823 -0.009979436 0.008417704 -0.0073388875 -0.004930407 -0.002657081 -0.0054523144 0.00171651 0.009712814 0.0045722723 0.008088603 -0.00047045827 0.0006449234 -0.002668352 -0.008779561 0.0034313034 0.0020933736 -0.009421854 -0.004968437 -0.009734099 -0.0057197916 0.0040645422 0.008642861 0.00411165 0.0023884643 0.008144778 -0.0011192096 -0.0013977134 -0.008746823 -0.00012579202 -0.0025675725 0.00038607715 0.007279662 -0.0070414604 -0.0039464748 -0.0066646053 -0.0035441148 -0.0033158315 0.002137121 0.0033281683 -0.004957187 -0.0045462907 0.0011386942 0.0054534827 0.0053736498 -0.0029685367 -0.0042665256 -0.005616647 -0.00054498314 0.001946373 0.0015253461 0.0073525296 -0.0027333724 -6.592393e-05 -0.0055276332 -0.0011700654 -0.0077119637 -0.0009593296 0.0013096749 -0.008594744 0.0087485835 -0.009207866 -0.009624677 -0.008511624 0.0073132683 0.0054655685 0.009249462
22
+ haplogroup 0.0025659278 0.00085168 -0.0025371916 0.00934742 0.0028080416 0.0041162586 -0.0011815964 0.00096541416 0.0066110776 -0.00074895076 0.0033208325 -0.00070219487 0.0052740807 0.003645613 0.0026175152 -0.0053456044 -0.004693721 0.004352339 -0.0059164464 -0.00020070269 -0.0006396672 0.0034715144 -0.008427317 0.0088428045 -0.0014485243 -0.005307692 0.0040584584 -0.001898596 -0.007778139 -0.0044734394 -0.0003679351 -0.0089815045 0.0005416724 0.002407686 -0.003227299 0.0025667753 0.0024930644 0.009990179 0.0014140693 0.0020159276 0.0027784512 -0.0020868885 -0.008718105 0.008073382 -0.0019698895 -0.009723993 -0.006550278 -0.0039781313 0.003948964 0.0050270366 0.0061098747 -0.006815141 0.00066107995 -0.0028290635 -0.0052407067 0.006984182 0.0039222264 -0.003121762 -0.008263934 -0.0051569464 -0.00065567193 0.0078113875 0.006122021 -0.008424067 -0.0096058855 0.0071855173 -0.0022900787 -0.0036282074 0.005704672 -0.0058300486 0.005136189 -0.00020829153 -0.0068513798 -0.00030139415 0.006364283 0.009325248 0.0022419153 0.0050703404 -0.0050120936 -0.0008110871 -0.005373588 0.0011743606 -0.0017981603 -0.0036161384 -0.0070382343 0.009639485 0.003012655 -0.0022897385 -0.0041911877 0.0076894285 -0.0064663296 0.0031200873 0.0008309826 0.008321212 0.0068888706 -0.0028947534 0.002593874 -0.0016730811 -0.009431767 -0.0026270088
23
+ h 0.0013225824 0.0065497826 0.009982806 0.009062454 -0.0079781795 0.0065080435 -0.0057147983 -0.0009299061 0.00047654507 0.0065626903 0.0044563343 0.0045750956 0.0095022535 0.00038496728 -0.0060190535 -0.006347197 0.0064362343 -0.005219293 -0.002869563 0.004042792 -0.002286449 -0.006022882 -0.0023193487 0.0012384101 0.0021826315 0.0061027543 -0.005193723 0.003081824 0.0072158594 0.0022087328 0.0054155486 -0.004879429 0.0061283903 -0.007640156 0.0034881763 -0.009306421 -0.0025874602 -0.00905658 -0.0016061858 -0.005364485 -0.0039271545 0.0011356737 0.002771372 -0.0014860439 -0.008151553 -0.0059441784 0.00080055697 -0.0039708167 -0.009422841 -0.0007733177 0.0066586556 0.005949332 -0.0099333245 0.0030846666 -0.006018299 -0.009179041 0.00015740465 -0.0003979007 -0.006993792 -0.0063003623 -0.0024212876 0.0071041975 -0.0074873487 0.0077126683 -0.000499351 0.001135528 0.009489626 0.0047690077 -0.0035878688 0.00373115 0.0035563034 0.0063642766 7.750339e-05 -0.0044055916 0.001321394 -0.005388977 0.0014417345 0.004943775 0.0051506218 0.009180272 -0.0075472356 -0.005428668 0.0064623333 0.0013423576 -0.0066391225 0.0008783591 0.0027003903 -0.0025289776 -0.004963421 0.0049924683 0.009631416 -0.0073435763 -7.912599e-05 -0.0025523733 -0.0063192695 -0.001368983 -0.005227159 0.009048553 -0.005790704 0.003674939
24
+ is -0.00023357147 0.004226683 0.0021067455 0.009996419 0.0006458492 -0.005461563 -0.0011838758 0.0020920378 -0.0033855627 -0.007853136 -0.005604329 -0.0067612384 0.006366702 0.0039265845 0.008232181 0.0065088123 -0.0061183744 0.002733512 0.008466464 0.0015833755 0.0030677342 0.0058010546 -0.008839754 0.009125629 0.0068226005 0.008512217 -0.0082233 0.0061861346 0.006626654 -0.0013528146 -0.0062799496 0.0053081806 -0.006868758 -0.005337174 0.0035091531 0.008081314 0.008700704 -0.0043939846 -0.0091931205 0.009603682 0.006290027 -0.0039766026 -0.008465367 -0.004691139 -0.0039542373 -0.0032808431 0.0008109401 -0.00030902817 -0.0031103012 -0.005998526 0.009428418 -0.004739384 -0.007274209 0.0076703983 0.0025008747 0.0086274175 -0.004468981 -0.0069012893 0.0009802914 -0.0011801491 -0.009394523 -0.0015968346 0.0030780574 0.006576642 0.0068287384 0.0032347892 -0.0044282703 -0.0018157784 -0.0039494233 0.0057785274 -0.006343468 0.002114367 -0.0013383601 -0.0057999003 -0.007236314 0.0058711045 -0.008345587 -0.00067066104 0.0028193784 0.00773521 -0.007315293 0.003294973 0.009805078 -0.0069755646 -0.003540081 0.005130921 0.005245436 0.0016209023 0.00797557 0.00082546985 0.0018813204 -0.0015988776 -0.008149317 0.0032639706 0.0019852505 -0.008730082 -0.0006569945 7.3046285e-05 -2.6318648e-06 0.008703764
25
+ mitochondrial -0.002508221 -0.0059015388 0.007485539 -0.007257687 -0.008965709 -0.0017888069 -0.008367486 0.00039139786 0.0019467709 -0.0024699308 -0.00644677 -0.00032192905 -0.0010975264 0.0034935323 0.008127049 0.0058537317 0.008440359 -0.0089677265 0.00944024 -0.002368706 0.008696626 0.0023858226 0.0035850583 -0.0095805535 -0.009488111 0.008984071 -0.002896514 0.0028174375 0.0064166263 -0.00029972216 0.00971954 -0.0010352092 -0.009671927 -0.0070548807 -0.0010439103 -0.008674508 0.0074211163 0.0036188734 -0.00874913 0.008480371 0.008929614 0.0058477637 0.0069070626 -0.009568968 0.0004927428 -0.009223568 -0.0036663204 0.00025142074 -0.0002807199 0.0014672013 0.0032786338 0.0021258853 0.005320648 0.0075189634 -0.005886681 0.007957336 0.005991082 0.009785411 0.0046226517 -0.0033269909 -0.0037473391 -0.00062982703 -0.0016548736 0.009871284 0.0011211695 0.00400867 0.0034179776 -0.008850507 0.006720342 0.008190563 -0.0016650181 0.0023356378 -0.0064802184 -0.006126035 0.0082164975 -0.0030429186 0.0067422306 0.001552869 -0.0019822652 0.0030546081 -0.004023311 -0.0017839139 0.0013798403 0.004887597 -0.0014078929 0.0006583137 -0.007930928 0.00949345 -0.008762073 0.007072499 0.0039040898 -0.0069980817 -0.005295161 -0.007937933 -0.0051285303 0.00707022 0.009641066 0.0021544741 0.0006394228 0.009524309
core/NER/word2Vec/testModel/test_model_updated.model ADDED
Binary file (30.7 kB). View file
 
core/NER/word2Vec/word2vec.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''WORD TO VECTOR'''
2
+ import pandas as pd
3
+ import json
4
+ import gensim
5
+ import spacy
6
+ from core.DefaultPackages import openFile, saveFile
7
+ from core.NER import cleanText
8
+ from gensim.models.keyedvectors import KeyedVectors
9
+ from gensim.test.utils import common_texts
10
+ from gensim.models.word2vec import Word2Vec
11
+ from gensim.scripts.glove2word2vec import glove2word2vec
12
+ from gensim.test.utils import datapath, get_tmpfile
13
+ from gensim.models import Phrases
14
+ from gensim.models.phrases import Phraser
15
+ import sys
16
+ import subprocess
17
+ import os
18
+ # can try multiprocessing to run quicker
19
+ import multiprocessing
20
+ import copy
21
+ sys.setrecursionlimit(1000)
22
+ # create folder word2Vec
23
+ #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
24
+ # create word2vec model
25
+ #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
26
+ '''Some notes for this model
27
+ sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
28
+ a similar word to the word we are finding, so can we try to preprocess text so that
29
+ we make the corpus more effective and only contains the important words. Then when we
30
+ train the model, the important words will be seen as important. Or
31
+ when we already have the similar list of words, we can remove the words in there
32
+ that are stopwords/unnecessary words.'''
33
+ ### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
34
class word2Vec():
    """Build corpora from text/tables and train, update and query Word2Vec models.

    Text cleaning is delegated to ``cleanText.cleanGenText`` (stored in
    ``self.cl``); model handling is delegated to gensim.
    """

    # Tokens produced by pandas for empty cells / unnamed headers; they carry
    # no meaning and are dropped when a table is turned into a corpus.
    _TABLE_STOP_WORDS = ("NaN", "Unnamed:", "nan")

    # (exclusive upper bound on number of sentences,
    #  (window, vector_size, sample, negative, epochs, sg))
    _PARAM_TIERS = (
        (2000, (3, 100, 1e-3, 5, 20, 1)),      # small corpus: skip-gram for rare words
        (10000, (5, 150, 1e-4, 10, 20, 1)),
        (100000, (7, 200, 1e-5, 15, 15, 1)),
        (500000, (10, 250, 1e-5, 15, 10, 0)),  # CBOW is okay when data is large
    )
    _PARAM_LARGEST = (12, 300, 1e-6, 20, 5, 0)  # very large corpus

    def __init__(self, nameFile=None, modelName=None):
        """Store the optional corpus file / model name and create the text cleaner."""
        self.nameFile = nameFile
        self.modelName = modelName
        # The spaCy pipeline is slow to load, so it is only loaded on first
        # use by spacy_similarity().
        self.nlp = None
        self.cl = cleanText.cleanGenText()

    def spacy_similarity(self, word):
        """Print the pairwise spaCy similarity of every token pair in ``word``.

        The ``en_core_web_lg`` pipeline is loaded on first call (medium or
        large models are recommended when vectors are needed).
        """
        if getattr(self, "nlp", None) is None:
            self.nlp = spacy.load("en_core_web_lg")
        doc = self.nlp(word)
        for token1 in doc:
            for token2 in doc:
                print(token1.text, token2.text, token1.similarity(token2))

    # clean text before transform to corpus
    def cleanTextBeforeCorpus(self, oriText, doi=None):
        """Clean one line of text word by word.

        Each space-separated word has the DOI (first occurrence only),
        punctuation, URLs, HTML tags and stray whitespace removed through the
        cleaner. Returns the surviving words, each followed by one space.
        """
        hasDoi = doi is not None and doi in oriText
        alreadyRemoveDoi = False
        kept = []
        for word in oriText.split(" "):
            # remove DOI (only until the first word that actually changed)
            if hasDoi and not alreadyRemoveDoi:
                newWord = self.cl.removeDOI(word, doi)
                if len(newWord) > 0 and newWord != word:
                    alreadyRemoveDoi = True
                word = newWord
            word = self.cl.removePunct(word, True)
            word = self.cl.removeURL(word)
            word = self.cl.removeHTMLTag(word)
            word = self.cl.removeTabWhiteSpaceNewLine(word)
            if len(word) > 0:
                kept.append(word + " ")
        return "".join(kept)

    def cleanAllTextBeforeCorpus(self, allText, doi=None):
        """Clean a whole document.

        Paragraphs are separated by a blank line ("\\n\\n"); every line of a
        non-empty paragraph is cleaned with cleanTextBeforeCorpus and the
        Spire.Doc evaluation banner is stripped.
        """
        remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
        pieces = []
        if len(allText) > 0:
            for paragraph in allText.split("\n\n"):
                if len(paragraph) == 0:
                    continue
                for line in paragraph.split("\n"):
                    if remove in line:
                        line = line.replace(remove, "")
                    pieces.append(self.cleanTextBeforeCorpus(line, doi) + "\n")
                pieces.append("\n\n")
        return "".join(pieces)

    @staticmethod
    def _openExcel(excelFile):
        """Open ``excelFile`` as a pandas ExcelFile, retrying with an explicit engine."""
        try:
            return pd.ExcelFile(excelFile)
        except Exception:
            # Engine auto-detection failed: choose it from the extension.
            engine = 'xlrd' if str(excelFile).endswith('.xls') else 'openpyxl'
            return pd.ExcelFile(excelFile, engine=engine)

    def tableTransformToCorpusText(self, df, excelFile=None):
        """Turn tables into ``{"corpus<i>": [[token, ...], ...]}``.

        ``df`` may be a single DataFrame or a sequence of DataFrames (PDF /
        Word tables); it is ignored when ``excelFile`` is given, in which case
        every sheet except the "Evaluation Warning" sheet is read. Each table
        row becomes one token list; tables that yield no tokens are omitted.
        """
        corpus = {}
        if excelFile is None:
            if df is None or len(df) == 0:
                return corpus
            # A single DataFrame is handled as a one-element list of tables;
            # indexing it with df[i] would select a column instead.
            tables = [df] if isinstance(df, pd.DataFrame) else df
            for i, table in enumerate(tables):
                rows = self.helperRowTableToCorpus(table.values.tolist())
                if len(rows) > 0:
                    corpus["corpus" + str(i)] = rows
            return corpus

        # Open the workbook once and make sure the handle is released.
        with self._openExcel(excelFile) as xls:
            for s, sheetName in enumerate(xls.sheet_names):
                if sheetName == 'Evaluation Warning':
                    continue
                data = pd.read_excel(xls, sheetName)
                rows = self.helperRowTableToCorpus(data.values.tolist())
                if len(rows) > 0:
                    corpus["corpus" + str(s)] = rows
        return corpus

    def _cellToTokens(self, cell):
        """Return the cleaned, lower-cased tokens contained in one table cell.

        Drops pandas artefacts ("NaN", "Unnamed:", "nan"), stop words,
        tokens shorter than two characters and purely numeric tokens (a
        leading number is just a row index; something like "KM1" is kept).
        """
        text = str(cell)
        tokens = []
        if len(text) == 0:
            return tokens
        for word in text.split(" "):
            if word in self._TABLE_STOP_WORDS:
                continue
            word = self.cl.removePunct(word)
            word = " ".join(self.cl.removeStopWords(word))
            word = self.cl.removeTabWhiteSpaceNewLine(word)
            if len(word) <= 1:
                continue
            for piece in word.split(" "):
                if len(piece) > 1 and not piece.isnumeric():
                    tokens.append(piece.lower())
        return tokens

    def helperRowTableToCorpus(self, textList):
        """Convert a table (list of rows) into one token list per non-empty row."""
        outputDF = []
        for line in textList:
            outputLine = []
            for cell in line:
                outputLine.extend(self._cellToTokens(cell))
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF

    def helperColTableToCorpus(self, dfList):
        """Convert a table (list of rows) into one token list per non-empty column.

        The length of the first row is used as the number of columns; rows
        that are shorter simply contribute nothing to the missing columns.
        """
        outputDF = []
        if not dfList:
            return outputDF
        for pos in range(len(dfList[0])):
            outputLine = []
            for line in dfList:
                if pos < len(line):
                    outputLine.extend(self._cellToTokens(line[pos]))
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF

    # create a corpus
    def createCorpusText(self, corpusText):
        """Split cleaned text into paragraphs of tokenised sentences.

        Example input::

            Cat is an animal. Tom is cat.

            Mouse is an animal.
            Jerry is mouse.

        Paragraphs are separated by a blank line, sentences by "." or a line
        break. The result maps "Paragraph <index>" to a list of sentences,
        each a list of lower-cased words with stop words and purely numeric
        words removed. Paragraphs without any sentence are left out.
        """
        texts = {}
        for pos, paragraph in enumerate(corpusText.split("\n\n")):
            if len(paragraph) == 0:
                continue
            sentences = []
            for line in paragraph.split("\n"):
                for sentence in line.split("."):
                    if len(sentence) == 0:
                        continue
                    sentence = self.cl.removeTabWhiteSpaceNewLine(sentence).lower()
                    words = []
                    for word in sentence.split(" "):
                        if len(word) == 0:
                            continue
                        # removeStopWords yields the words that survive filtering
                        for w in self.cl.removeStopWords(word):
                            if len(w) > 0 and not w.isnumeric():
                                words.append(w)
                    if len(words) > 0:
                        sentences.append(words)
            if len(sentences) > 0:
                texts["Paragraph " + str(pos)] = sentences
        return texts

    def selectParaForWC(self, corpus):
        """Heuristically determine Word2Vec parameters from the corpus size.

        ``corpus`` is a list of tokenised sentences, e.g.
        ``[["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]``.
        Returns ``(window, vector_size, sample, negative, epochs, sg)``, or
        six ``None`` values when the corpus is empty.
        """
        corSize = len(corpus)
        if corSize == 0:
            return None, None, None, None, None, None
        for limit, params in self._PARAM_TIERS:
            if corSize < limit:
                return params
        return self._PARAM_LARGEST

    def trainWord2Vec(self, nameFile, modelName, saveFolder, window=None,
                      vector_size=None, sample=None, negative=None, epochs=None, sg=None):
        """Train a Word2Vec model from a corpus JSON file and save it.

        ``nameFile`` is a JSON file mapping corpus names to lists of tokenised
        sentences. Frequent bigrams are merged first. Any hyper-parameter left
        as ``None`` is filled in by selectParaForWC. The model is written to
        ``<saveFolder>/<modelName>.model`` and, in word2vec text format, to
        ``<saveFolder>/<modelName>.txt``. Nothing is saved (and ``None`` is
        returned) when there is no corpus, no usable parameters, or training
        fails three times.
        """
        jsonFile = openFile.openJsonFile(nameFile)  # corpus json file from an article
        if not jsonFile:
            print("No corpus to train")
            return
        combinedCorpus = []
        for key in jsonFile:
            combinedCorpus.extend(jsonFile[key])
        # detect phrases before choosing parameters
        phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
        bigram = Phraser(phrases)
        combinedCorpus = [bigram[sent] for sent in combinedCorpus]

        params = (window, vector_size, sample, negative, epochs, sg)
        if any(p is None for p in params):
            heuristic = self.selectParaForWC(combinedCorpus)
            params = tuple(h if p is None else p for p, h in zip(params, heuristic))
        if any(p is None for p in params):
            print("no parameter to train")
            return
        window, vector_size, sample, negative, epochs, sg = params

        # Leave one core free, but never ask gensim for zero worker threads.
        workers = max(1, multiprocessing.cpu_count() - 1)
        w2vModel = None
        # Retry limit: with a bad corpus training would otherwise fail forever.
        for attempt in range(1, 4):
            try:
                candidate = Word2Vec(
                    min_count=1,  # min_count=1 ensures all words are included
                    window=window,
                    vector_size=vector_size,
                    sample=sample,
                    alpha=0.03,
                    min_alpha=0.0007,
                    negative=negative,
                    workers=workers,
                    epochs=epochs,
                    sg=sg)
                candidate.build_vocab(combinedCorpus)
                candidate.train(combinedCorpus, total_examples=candidate.corpus_count, epochs=epochs)
                w2vModel = candidate
                break
            except Exception as e:
                print(f"Retry #{attempt} failed: {e}")
        if w2vModel is None:
            print("training failed, no model saved")
            return
        w2vModel.save(saveFolder+"/"+modelName+".model")
        w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
        print("done w2v")

    def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
        """Return the ``n`` words most similar to ``word``.

        ``modelFile`` is a model saved in word2vec text format. The result is
        the list of ``(word, similarity)`` pairs from gensim's
        ``most_similar``. ``cos_thres`` is accepted for compatibility but no
        threshold filtering is applied at the moment.
        """
        model = KeyedVectors.load_word2vec_format(modelFile, binary=False)  # model file in format txt
        return model.most_similar(positive=[word], topn=n)

    # add more data to existing word2vec model
    def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
        """Extend an existing model with new tokenised sentences and save it.

        Raises ``ValueError`` when ``newCorpus`` is empty. With ``saveFolder``
        the model is saved there as ``<original name>_updated.model``;
        otherwise the model at ``modelPath`` is overwritten.
        """
        if not newCorpus:
            raise ValueError("New corpus is empty!")

        model = Word2Vec.load(modelPath)

        # Phrase detection on new data
        phrases = Phrases(newCorpus, min_count=2, threshold=10)
        bigram = Phraser(phrases)
        newCorpus = [bigram[sent] for sent in newCorpus]

        # Update vocab & retrain
        model.build_vocab(newCorpus, update=True)
        model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)

        # Save updated model
        if saveFolder:
            os.makedirs(saveFolder, exist_ok=True)
            name = os.path.basename(modelPath).replace(".model", "_updated.model")
            model.save(f"{saveFolder}/{name}")
            print(f"🔁 Model updated and saved to {saveFolder}/{name}")
        else:
            model.save(modelPath)
            print(f"🔁 Model updated and overwritten at {modelPath}")

    # adding our model into spacy
    # this deals with command line; but instead of using it, we write python script to run command line
    def loadWordVec(self, modelName, wordVec):
        """Run ``python -m spacy init-model en <modelName> --vectors-loc <wordVec>``.

        ``modelName`` is the name to save into spaCy (it comes from the
        modelName used in trainWord2Vec); ``wordVec`` is the trained word2vec
        file in txt format.
        NOTE(review): ``init-model`` looks like the spaCy v2 CLI; confirm it
        matches the installed spaCy version.
        """
        subprocess.run([sys.executable,
                        "-m",
                        "spacy",
                        "init-model",
                        "en",
                        modelName,
                        "--vectors-loc",
                        wordVec])
        print("done")
core/__pycache__/data_preprocess.cpython-310.pyc ADDED
Binary file (16.9 kB). View file
 
core/__pycache__/drive_utils.cpython-310.pyc ADDED
Binary file (3.99 kB). View file
 
core/__pycache__/model.cpython-310.pyc ADDED
Binary file (26.6 kB). View file
 
core/__pycache__/mtdna_backend.cpython-310.pyc ADDED
Binary file (9.57 kB). View file
 
core/__pycache__/mtdna_classifier.cpython-310.pyc ADDED
Binary file (20.6 kB). View file
 
core/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (15.9 kB). View file
 
core/__pycache__/smart_fallback.cpython-310.pyc ADDED
Binary file (6.1 kB). View file
 
core/__pycache__/standardize_location.cpython-310.pyc ADDED
Binary file (2.38 kB). View file
 
core/__pycache__/upgradeClassify.cpython-310.pyc ADDED
Binary file (8 kB). View file
 
core/data_preprocess.py ADDED
@@ -0,0 +1,744 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, os, json, tempfile, subprocess, nltk
2
+
3
+ #import streamlit as st
4
+ from Bio import Entrez
5
+ from docx import Document
6
+ import fitz
7
+ import spacy
8
+ from spacy.cli import download
9
+
10
+ import core.model
11
+ import core.pipeline
12
+ from core.drive_utils import upload_file_to_drive
13
+ from core.NER.PDF import pdf
14
+ from core.NER.WordDoc import wordDoc
15
+ from core.NER.html import extractHTML
16
+ from core.NER.word2Vec import word2vec
17
+ #from transformers import pipeline
18
+ import urllib.parse, requests
19
+ from pathlib import Path
20
+ import pandas as pd
21
+
22
+
23
+ nltk.download('punkt_tab')
24
def download_excel_file(url, save_path="temp.xlsx"):
    """Download an Excel workbook and return the path of the local copy.

    Two kinds of URL are downloaded to ``save_path``:
      * Office online-viewer links (``view.officeapps.live.com``), whose real
        file address is carried in the ``src`` query parameter;
      * direct ``http(s)`` links ending in ``.xls`` or ``.xlsx``.
    Anything else (for example a path that was already downloaded) is
    returned unchanged.

    Raises whatever ``requests`` raises for a failed request, including an
    HTTP error status, so an error page is never written out as a workbook.
    """
    if "view.officeapps.live.com" in url:
        query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        real_url = urllib.parse.unquote(query["src"][0])
        response = requests.get(real_url)
        response.raise_for_status()  # Raises error if download fails
        with open(save_path, "wb") as f:
            f.write(response.content)
        return save_path
    elif url.startswith("http") and url.endswith((".xls", ".xlsx")):
        response = requests.get(url)
        response.raise_for_status()  # Raises error if download fails
        with open(save_path, "wb") as f:
            f.write(response.content)
        print(len(response.content))
        return save_path
    else:
        print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
        return url
42
def extract_text(link,saveFolder):
    """Fetch the document behind ``link`` and return its extracted text.

    The file is looked up in the system temp directory first, then in the
    Drive folder ``saveFolder`` and finally downloaded from ``link`` (and
    uploaded to Drive for later runs).  PDF and Word files are handed to
    the project's ``pdf.PDF`` / ``wordDoc.wordDoc`` extractors; other
    non-spreadsheet links containing "http" or "html" go through
    ``extractHTML.HTML``.

    Args:
        link: URL or file name of the document.
        saveFolder: Passed to the Drive helpers as the folder and to the
            extractor classes as their save folder.

    Returns:
        The extracted text, or ``""`` when nothing could be extracted or
        any step failed (failures are printed, never raised).
    """
    text = ""
    name = ""
    local_temp_path = None
    try:
        name = link.split("/")[-1]
        print("name: ", name)
        local_temp_path = os.path.join(tempfile.gettempdir(), name)
        print("this is local temp path: ", local_temp_path)
        if os.path.exists(local_temp_path):
            print("exist")
        else:
            # The module does `import core.pipeline`; the bare name
            # `pipeline` is never bound, so it must be reached via `core`.
            file_id = core.pipeline.find_drive_file(name, saveFolder)
            if file_id:
                print("📥 Downloading from Google Drive...")
                core.pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
            else:
                print("🌐 Downloading from web link...")
                response = requests.get(link)
                with open(local_temp_path, 'wb') as f:
                    f.write(response.content)
                print("✅ Saved locally.")
                # Upload to Drive so it's available for later
                core.pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
            print(local_temp_path)
        input_to_class = local_temp_path

        if link.endswith(".pdf"):
            print("inside pdf and input to class: ", input_to_class)
            print("save folder in extract text: ", saveFolder)
            p = pdf.PDF(input_to_class, saveFolder)
            text = p.extractText()
            print("text from pdf:")
            print(text)
        elif link.endswith((".doc", ".docx")):
            d = wordDoc.wordDoc(input_to_class,saveFolder)
            text = d.extractTextByPage()
        # Compare the extension against real spreadsheet suffixes; the
        # former `not in "xlsx"` was a substring test.
        elif link.split(".")[-1].lower() not in ("xls", "xlsx"):
            if "http" in link or "html" in link:
                print("html link: ", link)
                html = extractHTML.HTML("",link)
                text = html.getListSection()  # the text already clean
                print("text html: ")
                print(text)
        print("done extract text")
    except Exception as exc:
        print(f"❌ extract_text failed for {link}: {exc}")
        text = ""
    finally:
        # Always drop the temp copy, even after a failure, so a partial
        # download is not mistaken for a cached file on the next call.
        if name and local_temp_path and os.path.exists(local_temp_path):
            try:
                os.remove(local_temp_path)
                print(f"🧹 Deleted local temp file: {local_temp_path}")
            except OSError as exc:
                print(f"⚠️ Could not delete temp file {local_temp_path}: {exc}")
    return text
111
+
112
def extract_table(link,saveFolder):
    """Fetch the document behind ``link`` and return its tables.

    The file is resolved like in ``extract_text`` (temp directory, then
    Drive folder ``saveFolder``, then web download + upload).  Tables are
    read with ``pdf.PDF``, ``wordDoc.wordDoc``, pandas (Excel) or
    ``extractHTML.HTML`` depending on the link, then passed through
    ``clean_tables_format``.

    Args:
        link: URL or file name of the document.
        saveFolder: Passed to the Drive helpers as the folder and to the
            extractor classes as their save folder.

    Returns:
        The cleaned tables, or ``[]`` if nothing was found or any step
        failed (failures are printed, never raised).
    """
    table = []
    local_temp_path = None
    try:
        name = link.split("/")[-1]
        local_temp_path = os.path.join(tempfile.gettempdir(), name)
        if os.path.exists(local_temp_path):
            print("exist")
        else:
            # The module does `import core.pipeline`; the bare name
            # `pipeline` is never bound, so it must be reached via `core`.
            file_id = core.pipeline.find_drive_file(name, saveFolder)
            if file_id:
                print("📥 Downloading from Google Drive...")
                core.pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
            else:
                print("🌐 Downloading from web link...")
                response = requests.get(link)
                with open(local_temp_path, 'wb') as f:
                    f.write(response.content)
                print("✅ Saved locally.")
                # Upload to Drive so it's available for later
                core.pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
            print(local_temp_path)
        input_to_class = local_temp_path

        extension = link.split(".")[-1].lower()
        if link.endswith(".pdf"):
            p = pdf.PDF(input_to_class,saveFolder)
            table = p.extractTable()
        elif link.endswith((".doc", ".docx")):
            d = wordDoc.wordDoc(input_to_class,saveFolder)
            table = d.extractTableAsList()
        # Real suffix comparison; the former `in "xlsx"` was a substring test.
        elif extension in ("xls", "xlsx"):
            # NOTE(review): the workbook is read from the temp copy below,
            # so this extra download's result is unused — confirm whether
            # it is still needed.
            savePath = saveFolder +"/"+ link.split("/")[-1]
            download_excel_file(link, savePath)
            try:
                # Context manager closes the workbook before cleanup.
                with pd.ExcelFile(local_temp_path) as xls:
                    table = [
                        pd.read_excel(xls, sheet_name=sheet_name)
                        .fillna("").astype(str).values.tolist()
                        for sheet_name in xls.sheet_names
                    ]
            except Exception as e:
                print("❌ Failed to extract tables from Excel:", e)
        elif "http" in link or "html" in link:
            html = extractHTML.HTML("",link)
            table = html.extractTable()  # table is a list
        table = clean_tables_format(table)
    except Exception as exc:
        print(f"❌ extract_table failed for {link}: {exc}")
        table = []
    finally:
        # Always drop the temp copy, even after a failure.
        if local_temp_path and os.path.isfile(local_temp_path):
            try:
                os.remove(local_temp_path)
                print(f"🧹 Deleted local temp file: {local_temp_path}")
            except OSError as exc:
                print(f"⚠️ Could not delete temp file {local_temp_path}: {exc}")
    return table
182
+
183
def clean_tables_format(tables):
    """Return ``tables`` normalised to a list of tables of string rows.

    Each entry may be a pandas DataFrame or a list of row lists; anything
    else is ignored.  Cells are converted with ``str`` and stripped, empty
    cells are dropped, and rows or tables that end up empty are omitted.
    """
    if not tables:
        return []

    def _non_empty_rows(rows):
        kept_rows = []
        for row in rows:
            cells = [str(cell).strip() for cell in row]
            cells = [cell for cell in cells if cell]
            if cells:
                kept_rows.append(cells)
        return kept_rows

    result = []
    for candidate in tables:
        # DataFrames are first flattened to a list of string rows.
        if isinstance(candidate, pd.DataFrame):
            candidate = candidate.fillna("").astype(str).values.tolist()

        is_row_list = isinstance(candidate, list) and all(
            isinstance(row, list) for row in candidate
        )
        rows = _non_empty_rows(candidate) if is_row_list else []
        if rows:
            result.append(rows)
    return result
211
+
212
def normalize_text_for_comparison(s: str) -> str:
    r"""Return ``s`` in a canonical form for substring comparison.

    The text is lower-cased, Windows (``\r\n``) and classic Mac (``\r``)
    line endings are mapped to ``\n``, every run of whitespace (newlines
    included) is collapsed to one space, and the result is stripped.
    Paragraph breaks are therefore not preserved.
    """
    unified = s.lower().replace('\r\n', '\n').replace('\r', '\n')
    return re.sub(r'\s+', ' ', unified).strip()
230
def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
    """Build one LLM input string from document text plus relevant table rows.

    The result starts with a ``## Document Text`` section.  For every table
    that mentions a location keyword, ``accession_id`` or ``isolate``, each
    row whose normalised content is not already in the text is appended as
    its own ``## Table N`` section holding the JSON-encoded row.

    ``max_tokens``, ``keep_tables`` and ``tokenizer`` are accepted but have
    no effect on the result.

    Returns:
        The merged, stripped string.
    """
    import json

    search_terms = ["province","district","region","village","location", "country", "region", "origin", "ancient", "modern"]
    if accession_id:
        search_terms.append(accession_id.lower())
    if isolate:
        search_terms.append(isolate.lower())

    def _mentions_search_term(candidate):
        flattened = " ".join(" ".join(row).lower() for row in candidate)
        if accession_id and accession_id.lower() in flattened:
            return True
        return any(term.lower() in flattened for term in search_terms)

    sections = ["## Document Text\n" + text.strip() + "\n"]
    body_for_matching = normalize_text_for_comparison(text)

    for number, table in enumerate(tables or [], start=1):
        if not _mentions_search_term(table):
            continue
        for row in table:
            spaced = normalize_text_for_comparison(" ".join(row) if row else "")
            stacked = normalize_text_for_comparison("\n".join(row) if row else "")
            # Skip rows the document text already contains.
            if spaced in body_for_matching or stacked in body_for_matching:
                continue
            row_json = json.dumps([row], indent=2)
            sections.append(f"## Table {number}\n{row_json}\n")

    return "".join(sections).strip()
273
+
274
def preprocess_document(link, saveFolder, accession=None, isolate=None):
    """Extract text and tables from ``link`` and merge them for the LLM.

    Each stage is best-effort: if it raises, the error is printed and the
    stage's empty value is used instead.

    Args:
        link: URL or file name of the document.
        saveFolder: Folder handed to the extraction helpers.
        accession: Optional accession ID used to pick relevant tables.
        isolate: Optional isolate name used to pick relevant tables.

    Returns:
        ``(text, tables, final_input)``; ``("", [], "")`` if every stage
        failed.
    """
    # `except Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
    try:
        text = extract_text(link, saveFolder)
        print("text and link")
        print(link)
        print(text)
    except Exception as exc:
        print(f"❌ Text extraction failed for {link}: {exc}")
        text = ""

    try:
        tables = extract_table(link, saveFolder)
    except Exception as exc:
        print(f"❌ Table extraction failed for {link}: {exc}")
        tables = []

    try:
        final_input = merge_text_and_tables(text, tables, max_tokens=12000, accession_id=accession, isolate=isolate)
    except Exception as exc:
        print(f"❌ Merging text and tables failed for {link}: {exc}")
        final_input = ""
    return text, tables, final_input
290
+
291
def extract_sentences(text):
    """Split ``text`` after ``.``, ``!`` or ``?`` followed by whitespace.

    Returns the stripped, non-empty pieces in their original order.
    """
    result = []
    for piece in re.split(r'(?<=[.!?])\s+', text):
        trimmed = piece.strip()
        if trimmed:
            result.append(trimmed)
    return result
294
+
295
def is_irrelevant_number_sequence(text):
    """Tell whether ``text`` is mostly bare numbers (e.g. a table fragment).

    Text containing a code such as ``KM1`` or a word followed by a number
    (``Table 2``) is never considered irrelevant.  Otherwise the text is
    irrelevant when fewer than 20% of its whitespace-separated tokens are
    words of two or more letters and more than 50% are numbers, or when
    every token is a plain integer/decimal.

    Returns:
        ``True`` if the text should be discarded, else ``False``.
    """
    if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
        return False

    tokens = re.findall(r'\S+', text)
    if not tokens:
        return False

    word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
    number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
    total_tokens = len(tokens)
    if (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5):
        return True

    # Token-by-token check instead of the nested-quantifier pattern
    # `(\d+(\.\d+)?\s*)+`, which backtracks exponentially on a long digit
    # run followed by a non-matching character.
    return all(re.fullmatch(r'\d+(?:\.\d+)?', token) for token in tokens)
306
+
307
def remove_isolated_single_digits(sentence):
    """Drop whitespace-separated tokens that are exactly ``0`` or ``1``.

    The remaining tokens are re-joined with single spaces.
    """
    kept = [token for token in sentence.split() if token not in ('0', '1')]
    return ' '.join(kept).strip()
316
+
317
def get_contextual_sentences_BFS(text_content, keyword, depth=2):
    """Collect sentences related to ``keyword`` by breadth-first search.

    Seed sentences are those that mention ``keyword`` as a whole word, or
    that contain a code range (``A1YU101-A1YU137``) whose prefix matches
    the keyword's and whose numbers enclose the keyword's number.  Every
    code (two or more letters followed by digits) in a seed sentence
    becomes a start node.  Codes that share a sentence are neighbours; the
    search adds the sentences of every code reached at a level below
    ``depth``.

    Args:
        text_content: Text to search.
        keyword: Sample/accession code to look for.
        depth: Codes at this level or deeper are not expanded.

    Returns:
        The selected sentences, filtered by
        ``is_irrelevant_number_sequence``, cleaned by
        ``remove_isolated_single_digits``, sorted and joined with newlines.
    """
    from collections import deque

    # Compile once instead of once per sentence.
    code_in_text = re.compile(r'\b[A-Z]{2,}[0-9]+\b', re.IGNORECASE)
    code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
    range_pattern = re.compile(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', re.IGNORECASE)
    exact_pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)

    def extract_codes(sentence):
        # Codes like 'KM1' or 'MO6': at least 2 letters followed by digits.
        return code_in_text.findall(sentence)

    sentences = extract_sentences(text_content)
    relevant_sentences = set()
    initial_keywords = set()

    # Split the keyword into its prefix and trailing number, if it has one.
    keyword_prefix = None
    keyword_num = None
    keyword_match = code_pattern.search(keyword)
    if keyword_match:
        keyword_prefix = keyword_match.group(1).lower()
        keyword_num = int(keyword_match.group(2))

    for sentence in sentences:
        sentence_added = False

        # 1. Whole-word mention of the keyword.
        if exact_pattern.search(sentence):
            relevant_sentences.add(sentence.strip())
            initial_keywords.add(keyword.lower())
            sentence_added = True

        # 2. Code ranges that enclose the keyword.
        if keyword_prefix and keyword_num is not None:
            for r_match in range_pattern.finditer(sentence):
                start_code_str, end_code_str = r_match.group(1), r_match.group(2)
                start_match = code_pattern.search(start_code_str)
                end_match = code_pattern.search(end_code_str)
                if not (start_match and end_match):
                    continue
                same_prefix = (
                    keyword_prefix == start_match.group(1).lower()
                    and keyword_prefix == end_match.group(1).lower()
                )
                in_range = (
                    int(start_match.group(2)) <= keyword_num <= int(end_match.group(2))
                )
                if same_prefix and in_range:
                    relevant_sentences.add(sentence.strip())
                    initial_keywords.add(start_code_str.lower())
                    initial_keywords.add(end_code_str.lower())
                    sentence_added = True
                    break  # one matching range per sentence is enough

        # 3. Seed the traversal with every code of a matching sentence.
        if sentence_added:
            initial_keywords.update(code.lower() for code in extract_codes(sentence))

    # Index sentences by code and link codes that share a sentence.
    word_to_sentences = {}
    graph = {}
    for sent in sentences:
        codes = {code.lower() for code in extract_codes(sent)}
        for code in codes:
            word_to_sentences.setdefault(code, set()).add(sent.strip())
            graph.setdefault(code, set()).update(codes - {code})

    # Breadth-first traversal; deque gives O(1) pops from the left.
    queue = deque((k, 0) for k in initial_keywords if k in word_to_sentences)
    visited_words = set(initial_keywords)
    while queue:
        current_word, level = queue.popleft()
        if level >= depth:
            continue
        relevant_sentences.update(word_to_sentences.get(current_word, ()))
        for neighbor in graph.get(current_word, ()):
            if neighbor not in visited_words:
                visited_words.add(neighbor)
                queue.append((neighbor, level + 1))

    final_sentences = set()
    for sentence in relevant_sentences:
        if not is_irrelevant_number_sequence(sentence):
            processed_sentence = remove_isolated_single_digits(sentence)
            if processed_sentence:
                final_sentences.add(processed_sentence)

    return "\n".join(sorted(final_sentences))
429
+
430
+
431
+
432
def get_contextual_sentences_DFS(text_content, keyword, depth=2):
    """Follow code cross-references depth-first until a country is found.

    Starting from the sentences that contain ``keyword``, each sentence is
    sent to ``core.model.get_country_from_text`` (first a window around
    the current word, then the full sentence).  If no country is found the
    search recurses into the other codes of that sentence, nearest to the
    current word first, up to ``depth`` levels.

    Args:
        text_content: Text to search.
        keyword: Sample/accession code to start from.
        depth: Maximum recursion depth.

    Returns:
        ``(country, context)``.  ``country`` is ``"unknown"`` when the
        search fails.  ``context`` is the visited sentences (filtered,
        sorted, newline-joined), or ``text_content`` itself if none
        survive filtering.
    """
    sentences = extract_sentences(text_content)

    # word -> sentences containing it.  A dict (not a set) keeps document
    # order, so the traversal and the returned country are reproducible.
    word_to_sentences = {}
    for sent in sentences:
        stripped = sent.strip()
        for word in set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent)):
            word_to_sentences.setdefault(word.lower(), {})[stripped] = None

    def extract_codes(sentence):
        # Only codes like 'KSK1', 'MG272794', not pure numbers
        return re.findall(r'\b[A-Z]{2,}[0-9]+\b', sentence, re.IGNORECASE)

    def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
        country = "unknown"
        if current_depth > max_depth:
            return country, False
        if current_word not in word_to_sentences:
            return country, False

        for sentence in word_to_sentences[current_word]:
            if sentence == parent_sentence:
                continue  # avoid reusing the same sentence

            collected_sentences.add(sentence)

            # The module does `import core.model`; the bare name `model`
            # is never bound, so it must be reached via `core`.
            small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
            country = core.model.get_country_from_text(small_sen)
            if country.lower() != "unknown":
                return country, True
            country = core.model.get_country_from_text(sentence)
            if country.lower() != "unknown":
                return country, True

            codes_in_sentence = extract_codes(sentence)
            idx = next(
                (i for i, code in enumerate(codes_in_sentence)
                 if code.lower() == current_word.lower()),
                None,
            )
            if idx is None:
                continue

            # Nearest codes first; on equal distance prefer the one after
            # the current word.
            sorted_children = sorted(
                [code for code in codes_in_sentence if code.lower() not in visited_words],
                key=lambda x: (abs(codes_in_sentence.index(x) - idx),
                               0 if codes_in_sentence.index(x) > idx else 1)
            )

            for child in sorted_children:
                child_lower = child.lower()
                if child_lower not in visited_words:
                    visited_words.add(child_lower)
                    country, should_stop = dfs_traverse(
                        child_lower, current_depth + 1, max_depth,
                        visited_words, collected_sentences, parent_sentence=sentence
                    )
                    if should_stop:
                        return country, True

        return country, False

    # Begin DFS
    collected_sentences = set()
    visited_words = {keyword.lower()}
    country, _found = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)

    # Filter irrelevant sentences
    final_sentences = set()
    for sentence in collected_sentences:
        if not is_irrelevant_number_sequence(sentence):
            processed = remove_isolated_single_digits(sentence)
            if processed:
                final_sentences.add(processed)
    if not final_sentences:
        return country, text_content
    return country, "\n".join(sorted(final_sentences))
515
+
516
+ # Helper function for normalizing text for overlap comparison
517
def normalize_for_overlap(s: str) -> str:
    """Reduce ``s`` to lower-case ASCII letters/digits separated by single spaces.

    Every character that is not an ASCII letter, a digit or whitespace
    becomes a space before whitespace runs are collapsed.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    return re.sub(r'\s+', ' ', alnum_only.lower()).strip()
521
+
522
def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
    """Concatenate two texts without repeating content they share.

    Rules, in order:

    1. If either text is empty, the other is returned unchanged.
    2. If one text contains the other, the longer one is returned unchanged.
    3. If a suffix of ``text1`` equals a prefix of ``text2`` (exactly, or
       after ``normalize_for_overlap``), the longest such overlap is
       dropped from ``text2``.
    4. If both texts share a common prefix and both continue afterwards,
       ``text2``'s copy of the prefix is dropped.
    5. Otherwise the texts are simply concatenated.

    For rules 3-5 all whitespace runs in the result are collapsed to one
    space and the result is stripped.
    """
    if not text1:
        return text2
    if not text2:
        return text1

    # One text fully contains the other.
    if text2 in text1:
        return text1
    if text1 in text2:
        return text2

    def _collapse(merged):
        return re.sub(r'\s+', ' ', merged).strip()

    # Overlap at the junction: suffix of text1 == prefix of text2.
    junction_overlap = 0
    for size in range(min(len(text1), len(text2)), 0, -1):
        suffix1 = text1[-size:]
        prefix2 = text2[:size]
        if suffix1 == prefix2:
            junction_overlap = size
            break
        normalized_suffix = normalize_for_overlap(suffix1)
        # An empty normalised form means both ends are only punctuation or
        # whitespace.  That is not real overlap and must not make us drop
        # characters from text2.
        if normalized_suffix and normalized_suffix == normalize_for_overlap(prefix2):
            junction_overlap = size
            break

    if junction_overlap > 0:
        return _collapse(text1 + text2[junction_overlap:])

    # Shared beginning, e.g. "Hi, I am Vy. Nice to meet you." and
    # "Hi, I am Vy. Goodbye Vy.": keep the common prefix once.
    shared = 0
    for char1, char2 in zip(text1, text2):
        if char1 != char2:
            break
        shared += 1

    if shared > 0 and text1[shared:].strip() and text2[shared:].strip():
        return _collapse(text1 + text2[shared:])

    # No usable overlap: plain concatenation.
    return _collapse(text1 + text2)
589
+
590
+ # def save_text_to_docx(text_content: str, file_path: str):
591
+ # """
592
+ # Saves a given text string into a .docx file.
593
+
594
+ # Args:
595
+ # text_content (str): The text string to save.
596
+ # file_path (str): The full path including the filename where the .docx file will be saved.
597
+ # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
598
+ # """
599
+ # try:
600
+ # document = Document()
601
+
602
+ # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
603
+ # for paragraph_text in text_content.split('\n'):
604
+ # document.add_paragraph(paragraph_text)
605
+
606
+ # document.save(file_path)
607
+ # print(f"Text successfully saved to '{file_path}'")
608
+ # except Exception as e:
609
+ # print(f"Error saving text to docx file: {e}")
610
+ # def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
611
+ # """
612
+ # Saves a given text string into a .docx file locally, then uploads to Google Drive.
613
+
614
+ # Args:
615
+ # text_content (str): The text string to save.
616
+ # filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
617
+ # drive_folder_id (str): Google Drive folder ID where to upload the file.
618
+ # """
619
+ # try:
620
+ # # ✅ Save to temporary local path first
621
+ # print("file name: ", filename)
622
+ # print("length text content: ", len(text_content))
623
+ # local_path = os.path.join(tempfile.gettempdir(), filename)
624
+ # document = Document()
625
+ # for paragraph_text in text_content.split('\n'):
626
+ # document.add_paragraph(paragraph_text)
627
+ # document.save(local_path)
628
+ # print(f"✅ Text saved locally to: {local_path}")
629
+
630
+ # # ✅ Upload to Drive
631
+ # pipeline.upload_file_to_drive(local_path, filename, drive_folder_id)
632
+ # print(f"✅ Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
633
+
634
+ # except Exception as e:
635
+ # print(f"❌ Error saving or uploading DOCX: {e}")
636
def save_text_to_docx(text_content: str, full_local_path: str):
    """Write ``text_content`` to a .docx file at ``full_local_path``.

    Every ``\\n``-separated line becomes one paragraph.
    """
    docx_file = Document()
    lines = text_content.split('\n')
    for line in lines:
        docx_file.add_paragraph(line)
    docx_file.save(full_local_path)
    print(f"✅ Saved DOCX locally: {full_local_path}")
642
+
643
+
644
+
645
+ '''2 scenerios:
646
+ - quick look then found then deepdive and directly get location then stop
647
+ - quick look then found then deepdive but not find location then hold the related words then
648
+ look another files iteratively for each related word and find location and stop'''
649
def extract_context(text, keyword, window=500):
    """Return the lower-cased text surrounding ``keyword``.

    The search is case-insensitive.  If the keyword itself is absent and it
    ends in digits, the part before the trailing number is searched
    instead.  The slice spans ``window`` characters on either side of the
    first hit.

    Returns:
        The lower-cased slice, or ``"Sample ID not found."`` if neither the
        keyword nor its prefix occurs.
    """
    parsed = re.search(r'([A-Z0-9]+?)(\d+)$', keyword, re.IGNORECASE)
    prefix = parsed.group(1).lower() if parsed else None

    haystack = text.lower()
    position = haystack.find(keyword.lower())
    if position == -1 and prefix:
        # Fall back to the code prefix, e.g. "km" for "KM7".
        position = haystack.find(prefix)
    if position == -1:
        return "Sample ID not found."
    return haystack[max(0, position - window): position + window]
671
def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
    """Search a list of documents for the country linked to a sample.

    For every file and every keyword (``isolate`` first, then
    ``accession``):

    * Scenario 1 — the keyword occurs in the file: try the window around
      the keyword, then the whole document, then the BFS/DFS sentence
      chunks.
    * Scenario 2 — the keyword is absent: cache the document.  If the
      keyword was already seen in another file, merge the cached documents
      and rerun BFS/DFS to resolve cross-references.

    Args:
        filePaths: Links or paths of the documents to inspect.
        saveLinkFolder: Folder handed to ``preprocess_document``.
        accession: Optional accession ID.
        isolate: Optional isolate name.

    Returns:
        ``(country, context, merged_text)``.  When no country is found,
        ``country`` is ``"unknown"`` and ``context`` is ``""``.
    """
    # NOTE(review): block nesting was reconstructed from a view of this file
    # that had lost its indentation — confirm against the repository.

    def _pick(country_dfs, chunk_dfs, country_bfs, chunk_bfs):
        """Return (country, chunk) of the preferred hit, or None if both are unknown."""
        if country_dfs != "unknown" and country_bfs != "unknown":
            # Both searches succeeded: prefer the shorter supporting chunk.
            if len(chunk_dfs) <= len(chunk_bfs):
                return country_dfs, chunk_dfs
            return country_bfs, chunk_bfs
        if country_dfs != "unknown":
            return country_dfs, chunk_dfs
        if country_bfs != "unknown":
            return country_bfs, chunk_bfs
        return None

    cache = {}
    country = "unknown"
    output = ""
    keyword_appear = (False,"")
    keywords = []
    if isolate: keywords.append(isolate)
    if accession: keywords.append(accession)

    for f in filePaths:
        for keyword in keywords:
            _text, _tables, final_input = preprocess_document(f,saveLinkFolder, isolate=keyword)
            if keyword in final_input:
                # Scenario 1: direct location.
                # The module does `import core.model`; the bare name
                # `model` is never bound, so it must be reached via `core`.
                context = extract_context(final_input, keyword)
                country = core.model.get_country_from_text(context)
                if country != "unknown":
                    return country, context, final_input
                country = core.model.get_country_from_text(final_input)
                if country != "unknown":
                    return country, context, final_input

                # Might be a cross-reference.
                keyword_appear = (True, f)
                cache[f] = context
                small_output = merge_texts_skipping_overlap(output, context) + "\n"
                chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
                countryBFS = core.model.get_country_from_text(chunkBFS)
                # DFS runs on the text merged so far, before this file is added.
                countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
                output = merge_texts_skipping_overlap(output, final_input)
                hit = _pick(countryDFS, chunkDFS, countryBFS, chunkBFS)
                if hit is not None:
                    return hit[0], hit[1], output
            else:
                # Scenario 2: cross-reference, e.g. keyword A1YU101 is in
                # file 2 next to KM1 while KM1's location is in file 1.
                cache[f] = final_input
                if keyword_appear[0]:
                    for c in cache:
                        if c != keyword_appear[1]:
                            if cache[c].lower() not in output.lower():
                                output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
                    chunkBFS = get_contextual_sentences_BFS(output, keyword)
                    countryBFS = core.model.get_country_from_text(chunkBFS)
                    countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
                    hit = _pick(countryDFS, chunkDFS, countryBFS, chunkBFS)
                    if hit is not None:
                        return hit[0], hit[1], output
                elif cache[f].lower() not in output.lower():
                    output = merge_texts_skipping_overlap(output, cache[f]) + "\n"

    if len(output) == 0 or not keyword_appear[0]:
        for c in cache:
            if cache[c].lower() not in output.lower():
                output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
    return country, "", output
core/drive_utils.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ # Google Drive (optional)
3
+ from google.oauth2.service_account import Credentials
4
+ from googleapiclient.discovery import build
5
+ from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
6
+ import gspread
7
+ from oauth2client.service_account import ServiceAccountCredentials
8
+
9
+ import os, io, time, re, json
10
+
11
+ #––– Authentication setup –––
12
# Folder names used on Google Drive; the data folder name comes from the
# environment and a missing variable raises KeyError at import time.
GDRIVE_PARENT_FOLDER_NAME = "mtDNA-Location-Classifier"
GDRIVE_DATA_FOLDER_NAME = os.environ["GDRIVE_DATA_FOLDER_NAME"]

# Service-account credentials are supplied as a JSON string (from HF secrets).
GCP_CREDS_DICT = json.loads(os.environ["GCP_CREDS_JSON"])
GDRIVE_CREDS = Credentials.from_service_account_info(
    GCP_CREDS_DICT,
    scopes=["https://www.googleapis.com/auth/drive"],
)

# Shared Drive v3 client used by every helper in this module.
drive_service = build("drive", "v3", credentials=GDRIVE_CREDS)
17
+
18
def get_or_create_drive_folder(name, parent_id=None):
    """Return the ID of the Drive folder ``name``, creating it if needed.

    Args:
        name: Folder name to look up.
        parent_id: Optional ID of the parent folder to search and create in.

    Returns:
        The ID of the first matching, non-trashed folder, or of the newly
        created one.
    """
    # Backslashes and single quotes must be escaped inside a Drive query
    # string, otherwise a name such as "O'Brien" breaks the query.
    safe_name = name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"name='{safe_name}' and mimeType='application/vnd.google-apps.folder'"
        " and trashed = false"  # ignore folders sitting in the bin
    )
    if parent_id:
        query += f" and '{parent_id}' in parents"
    results = drive_service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
    items = results.get("files", [])
    if items:
        return items[0]["id"]

    file_metadata = {
        "name": name,
        "mimeType": "application/vnd.google-apps.folder"
    }
    if parent_id:
        file_metadata["parents"] = [parent_id]
    file = drive_service.files().create(body=file_metadata, fields="id").execute()
    return file["id"]
34
+ # def find_drive_file(filename, parent_id):
35
+ # """
36
+ # Checks if a file with the given name exists inside the specified Google Drive folder.
37
+ # Returns the file ID if found, else None.
38
+ # """
39
+ # query = f"'{parent_id}' in parents and name = '{filename}' and trashed = false"
40
+ # results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)', pageSize=1).execute()
41
+ # files = results.get('files', [])
42
+ # if files:
43
+ # return files[0]["id"]
44
+ # return None
45
+
46
def find_drive_file(filename, parent_id):
    """Look up a non-trashed file by name inside a Drive folder.

    Args:
        filename: Exact file name to search for.
        parent_id: ID of the folder to search in.

    Returns:
        The file ID if found; ``None`` if not found or if the lookup
        raised (the error is printed).
    """
    try:
        print(f"🔍 Searching for '{filename}' in folder: {parent_id}")
        # Backslashes and single quotes must be escaped inside a Drive
        # query string, otherwise such file names break the query.
        safe_name = filename.replace("\\", "\\\\").replace("'", "\\'")
        query = f"'{parent_id}' in parents and name = '{safe_name}' and trashed = false"
        results = drive_service.files().list(
            q=query,
            spaces='drive',
            fields='files(id, name)',
            pageSize=1
        ).execute()
        files = results.get('files', [])
        if files:
            print(f"✅ Found file: {files[0]['name']} with ID: {files[0]['id']}")
            return files[0]["id"]
        print("⚠️ File not found.")
        return None
    except Exception as e:
        print(f"❌ Error during find_drive_file: {e}")
        return None
70
+
71
+
72
+
73
+ # def upload_file_to_drive(local_path, remote_name, folder_id):
74
+ # file_metadata = {"name": remote_name, "parents": [folder_id]}
75
+ # media = MediaFileUpload(local_path, resumable=True)
76
+ # existing = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute().get("files", [])
77
+ # if existing:
78
+ # drive_service.files().delete(fileId=existing[0]["id"]).execute()
79
+ # file = drive_service.files().create(body=file_metadata, media_body=media, fields="id").execute()
80
+ # result = drive_service.files().list(q=f"name='{remote_name}' and '{folder_id}' in parents", fields="files(id)").execute()
81
+ # if not result.get("files"):
82
+ # print(f"❌ Upload failed: File '{remote_name}' not found in folder after upload.")
83
+ # else:
84
+ # print(f"✅ Verified upload: {remote_name}")
85
+ # return file["id"]
86
def upload_file_to_drive(local_path, remote_name, folder_id):
    """Upload ``local_path`` to a Drive folder, replacing same-named files.

    The new file is uploaded first and older copies are removed only
    afterwards, so a failed upload never destroys the existing file.

    Args:
        local_path: Path of the file to upload.
        remote_name: Name the file gets on Drive.
        folder_id: ID of the destination folder.

    Returns:
        The ID of the uploaded file, or ``None`` if the local file is
        missing or the upload failed (the error is printed).
    """
    try:
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"❌ Local file does not exist: {local_path}")

        # Backslashes and single quotes must be escaped in a Drive query.
        safe_name = remote_name.replace("\\", "\\\\").replace("'", "\\'")
        # Remember the copies that exist before the upload.
        stale = drive_service.files().list(
            q=f"name='{safe_name}' and '{folder_id}' in parents and trashed = false",
            fields="files(id)"
        ).execute().get("files", [])

        file_metadata = {"name": remote_name, "parents": [folder_id]}
        media = MediaFileUpload(local_path, resumable=True)
        file = drive_service.files().create(
            body=file_metadata,
            media_body=media,
            fields="id"
        ).execute()
        print(f"✅ Uploaded '{remote_name}' to Google Drive folder ID: {folder_id}")

        # Remove every older copy, not just the first; a failed delete must
        # not invalidate the successful upload.
        for old in stale:
            try:
                drive_service.files().delete(fileId=old["id"]).execute()
                print(f"🗑️ Deleted existing '{remote_name}' in Drive folder {folder_id}")
            except Exception as e:
                print(f"⚠️ Could not delete old copy {old['id']}: {e}")

        return file["id"]

    except Exception as e:
        print(f"❌ Error during upload: {e}")
        return None
115
+
116
+
117
def download_file_from_drive(remote_name, folder_id, local_path):
    """Download a file from a Drive folder to ``local_path``.

    Args:
        remote_name: Name of the file on Drive.
        folder_id: ID of the folder that holds it.
        local_path: Destination path on the local disk.

    Returns:
        ``True`` after a completed download, ``False`` if no non-trashed
        file with that name exists in the folder.
    """
    # Same escaping and trashed filter as find_drive_file, so both helpers
    # resolve the same file.
    safe_name = remote_name.replace("\\", "\\\\").replace("'", "\\'")
    results = drive_service.files().list(
        q=f"name='{safe_name}' and '{folder_id}' in parents and trashed = false",
        fields="files(id)"
    ).execute()
    files = results.get("files", [])
    if not files:
        return False

    request = drive_service.files().get_media(fileId=files[0]["id"])
    # The with-block closes the local file even if a chunk download fails.
    with io.FileIO(local_path, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            _, done = downloader.next_chunk()
    return True
130
def download_drive_file_content(file_id):
    """Download the Drive file ``file_id`` into memory and return it as text.

    The bytes are decoded as UTF-8.
    """
    buffer = io.BytesIO()
    media_request = drive_service.files().get_media(fileId=file_id)
    downloader = MediaIoBaseDownload(buffer, media_request)
    finished = False
    while not finished:
        _status, finished = downloader.next_chunk()
    return buffer.getvalue().decode("utf-8")
core/model.py ADDED
@@ -0,0 +1,1414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, os, json
2
+ import pycountry, faiss
3
+ from docx import Document
4
+ import numpy as np
5
+ from collections import defaultdict
6
+ import ast # For literal_eval
7
+ import math # For ceiling function
8
+ import core.data_preprocess
9
+ import core.mtdna_classifier
10
+ # --- IMPORTANT: UNCOMMENT AND CONFIGURE YOUR REAL API KEY ---
11
+ import google.generativeai as genai
12
+
13
+ #genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
14
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
15
+
16
+ import nltk
17
+ from nltk.corpus import stopwords
18
+ try:
19
+ nltk.data.find('corpora/stopwords')
20
+ except LookupError:
21
+ nltk.download('stopwords')
22
+ nltk.download('punkt_tab')
23
+ # # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
24
+ # # Prices are per 1,000 tokens
25
+ # PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
26
+ # PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
27
+ # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
28
+
29
+ # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
30
+ PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
31
+ PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
32
+
33
+ # Embedding-001 pricing per 1,000 input tokens
34
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens
35
+ # --- API Functions (REAL API FUNCTIONS) ---
36
+
37
+ # def get_embedding(text, task_type="RETRIEVAL_DOCUMENT"):
38
+ # """Generates an embedding for the given text using a Google embedding model."""
39
+ # try:
40
+ # result = genai.embed_content(
41
+ # model="models/text-embedding-004", # Specify the embedding model
42
+ # content=text,
43
+ # task_type=task_type
44
+ # )
45
+ # return np.array(result['embedding']).astype('float32')
46
+ # except Exception as e:
47
+ # print(f"Error getting embedding: {e}")
48
+ # return np.zeros(768, dtype='float32')
49
def get_embedding(text, task_type="RETRIEVAL_DOCUMENT"):
    """Embed *text* with ``models/text-embedding-004``, never raising.

    Args:
        text: Text to embed; empty or whitespace-only text is treated as an error.
        task_type: Task type forwarded to ``genai.embed_content``.

    Returns:
        A float32 NumPy vector. On any failure (blank text included) the
        error is printed and a 768-element zero vector is returned instead.
    """
    import numpy as np

    try:
        is_blank = (not text) or (not text.strip())
        if is_blank:
            raise ValueError("Empty text cannot be embedded.")
        response = genai.embed_content(
            model="models/text-embedding-004",
            content=text,
            task_type=task_type
        )
        vector = np.array(response['embedding'], dtype='float32')
    except Exception as e:
        print(f"❌ Embedding error: {e}")
        vector = np.zeros(768, dtype='float32')
    return vector
64
+
65
+
66
def call_llm_api(prompt, model_name="gemini-2.5-flash-lite"):
    """Send *prompt* to a Google Gemini model and return its reply.

    Args:
        prompt: Prompt passed to ``generate_content``.
        model_name: Name of the Gemini model to instantiate.

    Returns:
        ``(response_text, model)`` on success; the model instance is returned
        so the caller can count tokens with it. On any error the message is
        printed and ``("Error: Could not get response from LLM API.", None)``
        is returned.
    """
    try:
        llm = genai.GenerativeModel(model_name)
        answer_text = llm.generate_content(prompt).text
    except Exception as e:
        print(f"Error calling LLM: {e}")
        return "Error: Could not get response from LLM API.", None
    return answer_text, llm
75
+
76
+
77
+ # --- Core Document Processing Functions (All previously provided and fixed) ---
78
+
79
def read_docx_text(path):
    """Split a .docx file into plain text, table blocks and a guessed title.

    A table block starts at a paragraph beginning with ``"## Table "`` and
    keeps absorbing following paragraphs that start with ``[`` or ``"``.
    Any other non-empty paragraph ends the block and counts as plain text.

    Args:
        path: Path of the .docx document.

    Returns:
        ``(plain_text, table_strings, document_title)`` where ``plain_text``
        is the newline-joined prose, ``table_strings`` is a list with one
        newline-joined string per table block, and ``document_title`` falls
        back to ``"Unknown Document Title"``.
    """
    document = Document(path)

    # --- Title heuristic: inspect the non-empty paragraphs among the first five ---
    title = "Unknown Document Title"
    leading = [p.text.strip() for p in document.paragraphs[:5] if p.text.strip()]

    def looks_like_title(candidate):
        # Long lines are taken as titles unless they carry the journal name.
        return len(candidate) > 50 and "Human Genetics" not in candidate

    if leading:
        if looks_like_title(leading[0]):
            title = leading[0]
        elif len(leading) > 1 and looks_like_title(leading[1]):
            title = leading[1]
        elif any("Complete mitochondrial genomes" in p for p in leading):
            # Fallback to a known title phrase if present
            title = "Complete mitochondrial genomes of Thai and Lao populations indicate an ancient origin of Austroasiatic groups and demic diffusion in the spread of Tai–Kadai languages"

    # --- Body: route each paragraph to prose or to the table being collected ---
    prose_lines = []
    tables = []
    pending = []  # lines of the table in progress; non-empty only while inside one

    for paragraph in document.paragraphs:
        line = paragraph.text.strip()
        if not line:
            continue

        if line.startswith("## Table "):
            # A new table header closes the previous table, if any.
            if pending:
                tables.append("\n".join(pending))
            pending = [line]
            continue

        if pending and line.startswith(("[", '"')):
            # Still inside a table: rows start with '[' or a quoted string.
            pending.append(line)
            continue

        # Anything else is prose and terminates the table in progress.
        if pending:
            tables.append("\n".join(pending))
            pending = []
        prose_lines.append(line)

    # Flush a table that runs to the end of the document.
    if pending:
        tables.append("\n".join(pending))

    return "\n".join(prose_lines), tables, title
135
+
136
+ # --- Structured Data Extraction and RAG Functions ---
137
+
138
# First "[[ ... ]]" span in a string; DOTALL lets rows spread over several lines.
_NESTED_LIST_PATTERN = re.compile(r'(\[\s*\[\s*.*?\s*\]\s*\])', re.DOTALL)


def parse_literal_python_list(table_str):
    """Extract and evaluate the first ``[[...]]`` literal found in *table_str*.

    If nothing matches and the string mentions "table", a closing ``]]`` is
    appended and the search retried, to recover tables whose end was cut off.

    Args:
        table_str: Text that may contain a Python list-of-lists literal.

    Returns:
        The evaluated Python object, or ``[]`` when the input is not a
        string, contains no such literal, or the literal cannot be evaluated.
    """
    if not isinstance(table_str, str):
        return []

    list_match = _NESTED_LIST_PATTERN.search(table_str)
    if not list_match and "table" in table_str.lower():
        # then the table doesn't have the "]]" at the end
        list_match = _NESTED_LIST_PATTERN.search(table_str + "]]")
    if not list_match:
        return []

    try:
        return ast.literal_eval(list_match.group(1))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # literal_eval raises more than ValueError/SyntaxError on odd input
        # (e.g. TypeError for an unhashable dict key); treat all as "no table".
        print(f"Error evaluating literal: {e}")
        return []
154
+
155
+
156
+ _individual_code_parser = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
157
+ def _parse_individual_code_parts(code_str):
158
+ match = _individual_code_parser.search(code_str)
159
+ if match:
160
+ return match.group(1), match.group(2)
161
+ return None, None
162
+
163
+
164
def parse_sample_id_to_population_code(plain_text_content):
    """Map sample IDs to population codes listed in the document text.

    The text after the first section marker found (up to the next ``\\n##``
    or the end) is scanned for ``<id> <code>`` and ``<id1>-<id2> <code>``
    entries. A range whose two ends share a prefix is expanded to every ID
    in between; otherwise only the endpoints are recorded. The text is
    lower-cased for matching, and prefixes and codes are stored upper-cased.

    Args:
        plain_text_content: Plain text of the document.

    Returns:
        ``(sample_id_map, contiguous_ranges_data)``: a dict of sample ID to
        population code, and a ``defaultdict(list)`` of prefix to
        ``(start_num, end_num, population_code)`` tuples for expanded ranges.
    """
    sample_id_map = {}
    contiguous_ranges_data = defaultdict(list)

    section_start_markers = (
        "The sample identification of each population is as follows:",
        "## table",
    )

    relevant_text_search = None
    if plain_text_content:
        lowered_text = plain_text_content.lower()  # lower-case once, not per marker
        for marker in section_start_markers:
            relevant_text_search = re.search(
                re.escape(marker.lower()) + r"\s*(.*?)(?=\n##|\Z)",
                lowered_text,
                re.DOTALL
            )
            if relevant_text_search:
                break

    if not relevant_text_search:
        print("Warning: 'Sample ID Population Code' section start marker not found or block empty.")
        return sample_id_map, contiguous_ranges_data

    relevant_text_block = relevant_text_search.group(1).strip()

    mapping_pattern = re.compile(
        r'\b([A-Z0-9]+\d+)(?:-([A-Z0-9]+\d+))?\s+([A-Z0-9]+)\b',
        re.IGNORECASE)

    for match in mapping_pattern.finditer(relevant_text_block):
        id1_full_str, id2_full_str_opt, pop_code = match.groups()
        pop_code_upper = pop_code.upper()

        id1_prefix, id1_num_str = _parse_individual_code_parts(id1_full_str)
        if id1_prefix is None:
            continue
        prefix_upper = id1_prefix.upper()
        first_id = f"{prefix_upper}{id1_num_str}"

        # Single ID, or a range whose second end cannot be parsed.
        if not id2_full_str_opt:
            sample_id_map[first_id] = pop_code_upper
            continue
        id2_prefix_opt, id2_num_str_opt = _parse_individual_code_parts(id2_full_str_opt)
        if id2_prefix_opt is None:
            sample_id_map[first_id] = pop_code_upper
            continue

        last_id = f"{id2_prefix_opt.upper()}{id2_num_str_opt}"

        # Different prefixes cannot be enumerated: keep the two endpoints only.
        if id1_prefix.lower() != id2_prefix_opt.lower():
            sample_id_map[first_id] = pop_code_upper
            sample_id_map[last_id] = pop_code_upper
            continue

        try:
            start_num = int(id1_num_str)
            end_num = int(id2_num_str_opt)
        except ValueError:
            print(f" DEBUG_PARSING: ValueError in range conversion for {id1_num_str}-{id2_num_str_opt}. Adding endpoints only.")
            sample_id_map[first_id] = pop_code_upper
            sample_id_map[last_id] = pop_code_upper
            continue

        # Keep zero padding ("KM01-KM03" -> KM01, KM02, KM03) so expanded IDs
        # are spelled the same way as IDs that are listed individually.
        pad_width = len(id1_num_str) if id1_num_str.startswith("0") else 0
        for num in range(start_num, end_num + 1):
            sample_id_map[f"{prefix_upper}{str(num).zfill(pad_width)}"] = pop_code_upper
        contiguous_ranges_data[prefix_upper].append(
            (start_num, end_num, pop_code_upper)
        )

    return sample_id_map, contiguous_ranges_data
251
+
252
# Region / city keywords (lower-case) mapped to the country they belong to.
# Matching is by substring, so "siberia" also covers "south siberia"; the
# misspelled "south sibera" key is kept so text containing that typo still resolves.
country_keywords_regional_overrides = {
    "north thailand": "Thailand",
    "central thailand": "Thailand",
    "northeast thailand": "Thailand",
    "east myanmar": "Myanmar",
    "west thailand": "Thailand",
    "central india": "India",
    "east india": "India",
    "northeast india": "India",
    "south sibera": "Russia",
    "siberia": "Russia",
    "yunnan": "China",  # "tibet": "China",
    "sumatra": "Indonesia",
    "borneo": "Indonesia",
    "northern mindanao": "Philippines",
    "west malaysia": "Malaysia",
    "mongolia": "China",
    "beijing": "China",
    "north laos": "Laos",
    "central laos": "Laos",
    "west myanmar": "Myanmar",
}
263
+
264
+ # Updated get_country_from_text function
265
def _text_mentions_name(text_lower, name_lower):
    """Return True when *name_lower* occurs in *text_lower* as a whole word or phrase."""
    if name_lower not in text_lower:  # cheap rejection before building a regex
        return False
    pattern = r"(?<!\w)" + re.escape(name_lower) + r"(?!\w)"
    return re.search(pattern, text_lower) is not None


def get_country_from_text(text):
    """Resolve free text to a country name.

    Resolution order:
      1. exact (case-insensitive) match on a pycountry name, common name or
         official name;
      2. a pycountry name or common name mentioned in the text as a whole
         word; a name contained in a longer mentioned name is ignored, so
         "Papua New Guinea" is not reported as "Guinea" and "Romania" is
         not reported as "Oman";
      3. the regional keywords in ``country_keywords_regional_overrides``.

    NOTE(review): the overrides run only after pycountry found nothing, so a
    text naming two countries is decided by pycountry's iteration order.

    Args:
        text: Text that may name a country or a known region.

    Returns:
        The matched country name, or ``"unknown"``.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    text_lower = text.lower()
    countries = list(pycountry.countries)

    # 1. Exact matches across ALL countries first, so a short name can never
    #    pre-empt the exact match of a longer one.
    for country in countries:
        for field in ("name", "common_name", "official_name"):
            value = getattr(country, field, None)
            if value and text_lower == value.lower():
                return value

    # 2. Whole-word mentions, kept in pycountry iteration order.
    mentioned = []
    for country in countries:
        for field in ("name", "common_name"):
            value = getattr(country, field, None)
            if value and _text_mentions_name(text_lower, value.lower()):
                mentioned.append(value)
                break
    for value in mentioned:
        lowered = value.lower()
        is_part_of_longer = any(
            lowered != other.lower() and lowered in other.lower()
            for other in mentioned
        )
        if not is_part_of_longer:
            return value

    # 3. Regional keywords (e.g. "yunnan" -> China).
    for keyword, country_name in country_keywords_regional_overrides.items():
        if keyword in text_lower:
            return country_name

    # Broad regions such as "southeast asia" deliberately stay unresolved.
    return "unknown"
298
+
299
+ # Get the list of English stop words from NLTK
300
+ non_meaningful_pop_names = set(stopwords.words('english'))
301
+
302
def parse_population_code_to_country(plain_text_content, table_strings):
    """Derive per-population country, ethnicity and location maps.

    Table rows are scanned first; afterwards every sentence of the plain
    text is scanned with the same pattern, so sentence matches overwrite
    table matches for the same population code. Sentence matches are only
    stored when the code (and each individual value) is not an English
    stop word.

    Args:
        plain_text_content: Plain text of the document.
        table_strings: Table blocks as returned by ``read_docx_text``.

    Returns:
        ``(pop_code_country_map, pop_code_ethnicity_map,
        pop_code_specific_loc_map)``, each keyed by upper-cased population code.
    """
    pop_code_country_map = {}
    pop_code_ethnicity_map = {}
    pop_code_specific_loc_map = {}

    # Groups: (1) population name / ethnicity, (2) population code,
    # (3) region / specific location, (4) country or compass word,
    # (5) optional linguistic family, (6) optional trailing numbers.
    pop_info_pattern = re.compile(
        r'([A-Za-z\s]+?)\s+([A-Z]+\d*)\s+'
        r'([A-Za-z\s\(\)\-,\/]+?)\s+'
        r'(North+|South+|West+|East+|Thailand|Laos|Cambodia|Myanmar|Philippines|Indonesia|Malaysia|China|India|Taiwan|Vietnam|Russia|Nepal|Japan|South Korea)\b'
        r'(?:.*?([A-Za-z\s\-]+))?\s*'
        r'(\d+(?:\s+\d+\.?\d*)*)?',
        re.IGNORECASE
    )

    def split_match(match):
        """Return (pop_name, pop_code, specific_loc_text, final_country) for a match."""
        pop_name = match.group(1).strip()
        pop_code = match.group(2).upper()
        specific_loc_text = match.group(3).strip()
        country_text = match.group(4).strip()
        final_country = get_country_from_text(country_text)
        if final_country == 'unknown':
            # Group 4 may be just "North"/"South"; the location text may name the country.
            final_country = get_country_from_text(specific_loc_text)
        return pop_name, pop_code, specific_loc_text, final_country

    def record_table_row(row_text):
        """Store the first pattern match of one table row, unfiltered."""
        match = pop_info_pattern.search(row_text)
        if not match:
            return
        pop_name, pop_code, specific_loc_text, final_country = split_match(match)
        pop_code_country_map[pop_code] = final_country
        pop_code_ethnicity_map[pop_code] = pop_name
        pop_code_specific_loc_map[pop_code] = specific_loc_text

    for table_str in table_strings:
        table_data = parse_literal_python_list(table_str)
        if not table_data:
            continue
        if isinstance(table_data[0], list):
            for row in table_data:
                record_table_row(" ".join(map(str, row)))
        else:
            # A flat list is treated as one single row.
            record_table_row(" ".join(map(str, table_data)))

    # Canonical spelling for ethnicity names recognised in plain text.
    canonical_ethnicities = {
        "khon mueang": "Khon Mueang",
        "lawa": "Lawa",
        "mon": "Mon",
        "seak": "Seak",
        "nyaw": "Nyaw",
        "nyahkur": "Nyahkur",
        "suay": "Suay",
        "soa": "Soa",
        "bru": "Bru",
        "khamu": "Khamu",
    }

    # Fallback to parsing general plain text content (sentences).
    # The module is referenced through the package because the file imports
    # it as ``import core.data_preprocess``.
    sentences = core.data_preprocess.extract_sentences(plain_text_content)
    for sentence in sentences:
        for match in pop_info_pattern.finditer(sentence):
            pop_name, pop_code, specific_loc_text, final_country = split_match(match)

            # The pattern is case-insensitive, so ordinary words can look like
            # codes; stop words are filtered out here.
            if pop_code.lower() not in non_meaningful_pop_names:
                if final_country.lower() not in non_meaningful_pop_names:
                    pop_code_country_map[pop_code] = final_country
                if pop_name.lower() not in non_meaningful_pop_names:
                    pop_code_ethnicity_map[pop_code] = pop_name
                if specific_loc_text.lower() not in non_meaningful_pop_names:
                    pop_code_specific_loc_map[pop_code] = specific_loc_text

            # Specific rules for ethnicity in plain text:
            canonical = canonical_ethnicities.get(pop_name.lower())
            if canonical is not None:
                pop_code_ethnicity_map[pop_code] = canonical

    return pop_code_country_map, pop_code_ethnicity_map, pop_code_specific_loc_map
436
+
437
def _collect_data_rows(table_data, table_strings):
    """Return the data rows that belong under the header row of *table_data*.

    When the parsed table carries its own rows they are used. Otherwise the
    one-row-per-table-string layout is assumed and the first row of every
    table string after the first is taken; strings that do not parse are skipped.
    """
    if len(table_data) > 1:
        return [row for row in table_data[1:] if isinstance(row, list)]
    rows = []
    for other_str in table_strings[1:]:
        parsed = parse_literal_python_list(other_str)
        if parsed and isinstance(parsed[0], list):
            rows.append(parsed[0])
    return rows


def general_parse_population_code_to_country(plain_text_content, table_strings):
    """Build lookup maps from tables identified by their header row.

    Recognised header combinations (lower-cased):
      * ``id`` + ``country`` (optionally ``province`` and
        ``population group / region``): code -> country / location / group;
      * ``sample id`` + ``population code``: sample ID -> population code;
      * ``population code`` + ``country``: population code -> country.

    NOTE(review): the original read rows only from ``table_strings[1:]``;
    rows inside the current table are now preferred - confirm this matches
    the table format produced upstream.

    Args:
        plain_text_content: Unused; kept for signature parity with
            ``parse_population_code_to_country``.
        table_strings: Table blocks as returned by ``read_docx_text``.

    Returns:
        ``(pop_code_country_map, pop_code_ethnicity_map,
        pop_code_specific_loc_map, sample_id_to_pop_code)``.
    """
    pop_code_country_map = {}
    pop_code_ethnicity_map = {}
    pop_code_specific_loc_map = {}
    sample_id_to_pop_code = {}

    for table_str in table_strings:
        table_data = parse_literal_python_list(table_str)
        if not table_data or not isinstance(table_data[0], list):
            continue

        # str() keeps non-string header cells from crashing .lower().
        header_row = [str(col).lower() for col in table_data[0]]
        header_map = {col: idx for idx, col in enumerate(header_row)}

        def cell(row, column):
            # str() makes numeric cells safe to strip.
            return str(row[header_map[column]]).strip()

        def has_columns(row, *columns):
            return all(header_map[column] < len(row) for column in columns)

        rows = _collect_data_rows(table_data, table_strings)

        # MJ17: Direct PopCode → Country
        if 'id' in header_map and 'country' in header_map:
            for row in rows:
                if len(row) < len(header_row):
                    continue
                pop_code = cell(row, 'id')
                pop_code_country_map[pop_code] = cell(row, 'country')
                pop_code_specific_loc_map[pop_code] = (
                    cell(row, 'province') if 'province' in header_map else 'unknown'
                )
                pop_code_ethnicity_map[pop_code] = (
                    cell(row, 'population group / region')
                    if 'population group / region' in header_map else 'unknown'
                )

        # A1YU101 or EBK/KSK: SampleID → PopCode
        elif 'sample id' in header_map and 'population code' in header_map:
            for row in rows:
                if len(row) < 2 or not has_columns(row, 'sample id', 'population code'):
                    continue
                sample_id = cell(row, 'sample id').upper()
                sample_id_to_pop_code[sample_id] = cell(row, 'population code').upper()

        # PopCode → Country (A1YU101/EBK mapping)
        elif 'population code' in header_map and 'country' in header_map:
            for row in rows:
                if len(row) < 2 or not has_columns(row, 'population code', 'country'):
                    continue
                pop_code = cell(row, 'population code').upper()
                pop_code_country_map[pop_code] = cell(row, 'country')

    return pop_code_country_map, pop_code_ethnicity_map, pop_code_specific_loc_map, sample_id_to_pop_code
486
+
487
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into whitespace-delimited word chunks with overlap.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words shared by consecutive chunks.

    Returns:
        List of chunk strings (words re-joined with single spaces); empty
        when *text* contains no words.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    words = text.split()
    num_words = len(words)
    # Always advance by at least one word; with overlap >= chunk_size the
    # window would otherwise never move and the loop would not terminate.
    step = max(1, chunk_size - overlap)

    chunks = []
    for start in range(0, num_words, step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= num_words:
            break
    return chunks
503
+
504
def _assemble_structured_entries(sample_id_map, pop_code_to_country,
                                 pop_code_to_ethnicity, pop_code_to_specific_loc):
    """Build the lookup records, keyed by sample ID or, failing that, by population code."""
    if sample_id_map:
        keyed_codes = list(sample_id_map.items())
    else:
        keyed_codes = [(pop_code, pop_code) for pop_code in pop_code_to_country]
    return {
        key: {
            'population_code': pop_code,
            'country': pop_code_to_country.get(pop_code, 'unknown'),
            'type': 'modern',
            'ethnicity': pop_code_to_ethnicity.get(pop_code, 'unknown'),
            'specific_location': pop_code_to_specific_loc.get(pop_code, 'unknown')
        }
        for key, pop_code in keyed_codes
    }


def build_vector_index_and_data(doc_path, index_path="faiss_index.bin", chunks_path="document_chunks.json", structured_path="structured_lookup.json"):
    """Build and persist the structured lookup and the FAISS index of a document.

    Steps: read the .docx, derive the structured lookup (header-driven table
    parsing first, regex-based parsing as fallback), write it to
    *structured_path*, chunk the normalised text, embed every chunk, and
    write the FAISS index and the chunks to disk.

    Args:
        doc_path: Path of the .docx document.
        index_path: Output path of the FAISS index.
        chunks_path: Output path of the JSON list of chunks.
        structured_path: Output path of the JSON structured lookup.

    Returns:
        ``(master_lookup, index, document_chunks, all_clean_chunk)``.

    Raises:
        ValueError: If the document yields no chunks to embed. The
            structured lookup file has already been written at that point.
    """
    print("Step 1: Reading document and extracting structured data...")
    plain_text_content, table_strings, document_title = read_docx_text(doc_path)
    pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc, sample_id_map = general_parse_population_code_to_country(plain_text_content, table_strings)

    final_structured_entries = _assemble_structured_entries(
        sample_id_map, pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc
    )
    if not final_structured_entries:
        # traditional way of A1YU101
        sample_id_map, _contiguous_ranges = parse_sample_id_to_population_code(plain_text_content)
        pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc = parse_population_code_to_country(plain_text_content, table_strings)
        final_structured_entries = _assemble_structured_entries(
            sample_id_map, pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc
        )

    master_lookup = {
        'document_title': document_title,
        'pop_code_to_country': pop_code_to_country,
        'pop_code_to_ethnicity': pop_code_to_ethnicity,
        'pop_code_to_specific_loc': pop_code_to_specific_loc,
        'sample_id_map': sample_id_map,
        'final_structured_entries': final_structured_entries
    }
    print(f"Structured lookup built with {len(final_structured_entries)} entries in 'final_structured_entries'.")

    with open(structured_path, 'w') as f:
        json.dump(master_lookup, f, indent=4)
    print(f"Structured lookup saved to {structured_path}.")

    print("Step 2: Chunking document for RAG vector index...")
    # The module is referenced through the package because the file imports
    # it as ``import core.data_preprocess``.
    clean_text, clean_table = "", ""
    if plain_text_content:
        clean_text = core.data_preprocess.normalize_for_overlap(plain_text_content)
    if table_strings:
        clean_table = core.data_preprocess.normalize_for_overlap(". ".join(table_strings))
    # Join with a space so the last word of the prose and the first word of
    # the tables are not fused into one token.
    all_clean_chunk = " ".join(part for part in (clean_text, clean_table) if part)
    document_chunks = chunk_text(all_clean_chunk)
    print(f"Document chunked into {len(document_chunks)} chunks.")

    print("Step 3: Generating embeddings for chunks (this might take time and cost API calls)...")

    chunk_embeddings = []
    for i, chunk in enumerate(document_chunks):
        embedding = get_embedding(chunk, task_type="RETRIEVAL_DOCUMENT")
        if embedding is not None and embedding.shape[0] > 0:
            chunk_embeddings.append(embedding)
        else:
            # Keep one vector per chunk so index positions match chunk positions.
            print(f"Warning: Failed to get valid embedding for chunk {i}. Skipping.")
            chunk_embeddings.append(np.zeros(768, dtype='float32'))

    if not chunk_embeddings:
        raise ValueError("No valid embeddings generated. Check get_embedding function and API.")

    embedding_dimension = chunk_embeddings[0].shape[0]
    index = faiss.IndexFlatL2(embedding_dimension)
    index.add(np.array(chunk_embeddings))

    faiss.write_index(index, index_path)
    with open(chunks_path, "w") as f:
        json.dump(document_chunks, f)

    print(f"FAISS index built and saved to {index_path}.")
    print(f"Document chunks saved to {chunks_path}.")
    return master_lookup, index, document_chunks, all_clean_chunk
648
+
649
+
650
def load_rag_assets(index_path="faiss_index.bin", chunks_path="document_chunks.json", structured_path="structured_lookup.json"):
    """Load pre-built RAG assets (FAISS index, chunks, structured lookup).

    Args:
        index_path: Path of the serialized FAISS index.
        chunks_path: Path of the JSON file holding the document chunks.
        structured_path: Path of the JSON file holding the structured lookup.

    Returns:
        A ``(master_structured_lookup, index, chunks)`` tuple. An asset that
        is missing or cannot be read is returned as its empty value
        (``{}``, ``None`` and ``[]`` respectively) so the caller can decide
        to rebuild it; this function does not raise for unreadable files.
    """
    print("Loading RAG assets...")

    master_structured_lookup = {}
    if os.path.exists(structured_path):
        try:
            with open(structured_path, 'r') as f:
                master_structured_lookup = json.load(f)
            print("Structured lookup loaded.")
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError
            # subclasses. Treat a corrupt file like a missing one, mirroring
            # how the index/chunks pair is handled below.
            print(f"Error loading structured lookup: {e}. Rebuilding is likely needed.")
            master_structured_lookup = {}
    else:
        print("Structured lookup file not found. Rebuilding is likely needed.")

    index = None
    chunks = []
    # The index and its chunk list are only useful together, so both files
    # must exist and both must load; otherwise neither is returned.
    if os.path.exists(index_path) and os.path.exists(chunks_path):
        try:
            index = faiss.read_index(index_path)
            with open(chunks_path, "r") as f:
                chunks = json.load(f)
            print("FAISS index and chunks loaded.")
        except Exception as e:
            print(f"Error loading FAISS index or chunks: {e}. Will rebuild.")
            index = None
            chunks = []
    else:
        print("FAISS index or chunks files not found.")

    return master_structured_lookup, index, chunks
677
+ # Helper function for query_document_info
678
def exactInContext(text, keyword):
    """Return True when ``keyword`` occurs in ``text``, ignoring case.

    Args:
        text: The context string to search.
        keyword: The substring to look for (e.g. a sample or isolate id).

    Returns:
        ``True`` if the lower-cased keyword is a substring of the lower-cased
        text, ``False`` otherwise. Non-string input (including ``None``) is
        treated as "not found" instead of raising. An empty keyword matches
        any text, as a plain substring test does.
    """
    if not isinstance(text, str) or not isinstance(keyword, str):
        return False
    return keyword.lower() in text.lower()
697
def chooseContextLLM(contexts, kw):
    """Pick the context text to send to the LLM for keyword ``kw``.

    Args:
        contexts: Mapping of label -> context text. The fallback labels read
            here are "all_output", "chunk" and "document_chunk"; any of them
            may be absent or empty.
        kw: Keyword to look for in each context, or ``None``.

    Returns:
        A ``(label, context)`` tuple. The first non-empty context (in mapping
        order) that contains ``kw`` wins. If none contains it, the first
        non-empty fallback is returned in the order "all_output", "chunk",
        "document_chunk". ``(None, None)`` when nothing is available.
    """
    # Without a keyword there is nothing to match; go straight to fallbacks.
    if kw is not None:
        for label, context in contexts.items():
            if context and exactInContext(context, kw):
                return label, context

    # Keyword not found anywhere: prefer the widest context that exists.
    # .get() keeps a missing label from raising KeyError.
    for label in ("all_output", "chunk", "document_chunk"):
        fallback = contexts.get(label)
        if fallback:
            return label, fallback
    return None, None
713
def clean_llm_output(llm_response_text, output_format_str):
    """Parse a comma-separated LLM answer into de-duplicated field strings.

    Each non-blank line of the response is parsed on its own, according to
    ``output_format_str``:

    * ``"ethnicity, specific_location/unknown"`` -> 2 fields
    * ``"modern/ancient/unknown, ethnicity, specific_location/unknown"``
      -> 3 fields, falling back to 2 fields, 1 field, then a ``Type: ...``
      label search
    * anything else -> 4 fields (country, type, ethnicity, location),
      falling back to 2 fields (country, type), then ``Country: ...`` /
      ``Type: ...`` label searches

    Fields that cannot be extracted are reported as ``"unknown"``.

    Args:
        llm_response_text: Raw text returned by the LLM (may span several
            lines; ``None`` is treated as empty).
        output_format_str: The output-format string that was given to the LLM.

    Returns:
        A tuple of strings with 2, 3 or 4 elements depending on the format
        (see above). Within each field, distinct values found on different
        lines are joined with ``" or "`` in first-seen order.
    """
    targeted_format = "ethnicity, specific_location/unknown"
    typed_format = "modern/ancient/unknown, ethnicity, specific_location/unknown"

    def _join_unique(values):
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return " or ".join(dict.fromkeys(values))

    # Blank lines carry no answer; skipping them avoids spurious "unknown"
    # alternatives. Keep one empty line so an empty response still yields
    # "unknown" for every field.
    stripped_lines = [ln.strip() for ln in (llm_response_text or "").strip().split('\n')]
    lines = [ln for ln in stripped_lines if ln] or [""]

    countries, types, ethnicities, locations = [], [], [], []
    for line in lines:
        country = sample_type = ethnicity = location = "unknown"

        # Every pattern is matched against the current line only: the
        # patterns are anchored with ^...$ and cannot match a multi-line
        # response as a whole.
        if output_format_str == targeted_format:  # Targeted RAG output
            two_fields = re.search(r'^\s*([^,]+?),\s*(.+?)\s*$', line)
            if two_fields:
                ethnicity = two_fields.group(1).strip()
                location = two_fields.group(2).strip()
            else:
                print(" DEBUG: LLM did not follow expected 2-field format for targeted RAG. Defaulting to unknown for ethnicity/specific_location.")
        elif output_format_str == typed_format:
            three_fields = re.search(r'^\s*([^,]+?),\s*([^,]+?),\s*(.+?)\s*$', line)
            two_fields = re.search(r'^\s*([^,]+?),\s*([^,]+?)\s*$', line)
            one_field = re.search(r'^\s*([^,]+?)\s*$', line)
            if three_fields:
                sample_type = three_fields.group(1).strip()
                ethnicity = three_fields.group(2).strip()
                location = three_fields.group(3).strip()
            elif two_fields:
                sample_type = two_fields.group(1).strip()
                ethnicity = two_fields.group(2).strip()
            elif one_field:
                sample_type = one_field.group(1).strip()
            else:
                print(" DEBUG: LLM did not follow any expected simplified format. Attempting verbose parsing fallback.")
                type_label = re.search(r'Type:\s*([A-Za-z\s-]+)', line)
                if type_label:
                    sample_type = type_label.group(1).strip()
        else:
            four_fields = re.search(r'^\s*([^,]+?),\s*([^,]+?),\s*([^,]+?),\s*(.+?)\s*$', line)
            if four_fields:
                country = four_fields.group(1).strip()
                sample_type = four_fields.group(2).strip()
                ethnicity = four_fields.group(3).strip()
                location = four_fields.group(4).strip()
            else:
                print(f" DEBUG: Line did not follow expected 4-field format: {line}")
                two_fields = re.search(r'^\s*([^,]+?),\s*([^,]+?)\s*$', line)
                if two_fields:
                    country = two_fields.group(1).strip()
                    sample_type = two_fields.group(2).strip()
                else:
                    print(f" DEBUG: Fallback to verbose-style parsing: {line}")
                    country_label = re.search(r'Country:\s*([A-Za-z\s-]+)', line)
                    type_label = re.search(r'Type:\s*([A-Za-z\s-]+)', line)
                    if country_label:
                        country = country_label.group(1).strip()
                    if type_label:
                        sample_type = type_label.group(1).strip()

        countries.append(country)
        types.append(sample_type)
        ethnicities.append(ethnicity)
        locations.append(location)

    # Return only the fields that the requested format asks for.
    if output_format_str == targeted_format:
        return _join_unique(ethnicities), _join_unique(locations)
    if output_format_str == typed_format:
        return _join_unique(types), _join_unique(ethnicities), _join_unique(locations)
    return (_join_unique(countries), _join_unique(types),
            _join_unique(ethnicities), _join_unique(locations))
816
+
817
+ # def parse_multi_sample_llm_output(raw_response: str, output_format_str):
818
+ # """
819
+ # Parse LLM output with possibly multiple metadata lines + shared explanations.
820
+ # """
821
+ # lines = [line.strip() for line in raw_response.strip().splitlines() if line.strip()]
822
+ # metadata_list = []
823
+ # explanation_lines = []
824
+ # if output_format_str == "country_name, modern/ancient/unknown":
825
+ # parts = [x.strip() for x in lines[0].split(",")]
826
+ # if len(parts)==2:
827
+ # metadata_list.append({
828
+ # "country": parts[0],
829
+ # "sample_type": parts[1]#,
830
+ # #"ethnicity": parts[2],
831
+ # #"location": parts[3]
832
+ # })
833
+ # if 1<len(lines):
834
+ # line = lines[1]
835
+ # if "\n" in line: line = line.split("\n")
836
+ # if ". " in line: line = line.split(". ")
837
+ # if isinstance(line,str): line = [line]
838
+ # explanation_lines += line
839
+ # elif output_format_str == "modern/ancient/unknown":
840
+ # metadata_list.append({
841
+ # "country": "unknown",
842
+ # "sample_type": lines[0]#,
843
+ # #"ethnicity": parts[2],
844
+ # #"location": parts[3]
845
+ # })
846
+ # explanation_lines.append(lines[1])
847
+
848
+ # # Assign explanations (optional) to each sample — same explanation reused
849
+ # for md in metadata_list:
850
+ # md["country_explanation"] = None
851
+ # md["sample_type_explanation"] = None
852
+
853
+ # if md["country"].lower() != "unknown" and len(explanation_lines) >= 1:
854
+ # md["country_explanation"] = explanation_lines[0]
855
+
856
+ # if md["sample_type"].lower() != "unknown":
857
+ # if len(explanation_lines) >= 2:
858
+ # md["sample_type_explanation"] = explanation_lines[1]
859
+ # elif len(explanation_lines) == 1 and md["country"].lower() == "unknown":
860
+ # md["sample_type_explanation"] = explanation_lines[0]
861
+ # elif len(explanation_lines) == 1:
862
+ # md["sample_type_explanation"] = explanation_lines[0]
863
+ # return metadata_list
864
+
865
def parse_multi_sample_llm_output(raw_response: str, output_format_str):
    """Parse an LLM answer line plus its explanation sentences.

    The first non-blank line of ``raw_response`` holds comma-separated
    answers, one per field of ``output_format_str`` (itself split on
    ``", "``). The remaining non-blank lines are explanations, consumed in
    order by the fields whose answer is not "unknown". If there is exactly
    one explanation line and it contains several sentences separated by
    ``". "``, it is split into one explanation per sentence.

    Args:
        raw_response: Raw LLM text; ``None`` is treated as empty.
        output_format_str: Comma-separated field names, e.g.
            ``"country_name, modern/ancient/unknown"``. Empty or ``None``
            yields an empty result.

    Returns:
        ``{field: {"answer": str, field + "_explanation": str}}``. A missing,
        empty, or "unknown"-containing answer is normalized to "unknown",
        and its explanation is "unknown" too. When the explanations run out,
        the most recently consumed one is reused (an empty string if none was
        ever available).
    """
    metadata_list = {}

    # Skip leading blank lines so the answer line is not mistaken for an
    # explanation; strip() also removes stray "\r" from CRLF responses.
    non_blank_lines = [ln.strip() for ln in (raw_response or "").split("\n") if ln.strip()]
    answer_line = non_blank_lines[0] if non_blank_lines else ""
    output_answers = [part.strip() for part in answer_line.split(",")]
    explanation_lines = non_blank_lines[1:]
    print("raw explanation line which split by new line: ", explanation_lines)

    if len(explanation_lines) == 1:
        sentences = explanation_lines[0].split(". ")
        if len(sentences) > 1:
            explanation_lines = [s for s in sentences if s.strip()]
            print("explain line split by dot: ", explanation_lines)

    if not output_format_str:
        return metadata_list

    explain = ""
    for position, output in enumerate(output_format_str.split(", ")):
        answer = output_answers[position] if position < len(output_answers) else "unknown"
        # The LLM sometimes echoes the field label, e.g.
        # "country_name: Europe, modern/ancient: modern" -> keep the value.
        if ": " in answer:
            answer = answer.split(": ")[1].strip()
        if not answer or "unknown" in answer.lower():
            answer = "unknown"

        if answer == "unknown":
            explanation = "unknown"
        else:
            if explanation_lines:
                explain = explanation_lines.pop(0)
            explanation = explain

        metadata_list[output] = {"answer": answer,
                                 output + "_explanation": explanation}
    return metadata_list
910
+
911
def merge_metadata_outputs(metadata_list):
    """Merge a list of metadata dicts into one, joining differing values with ' or '.

    Args:
        metadata_list: List of dicts. The keys of the first dict define the
            keys of the result; a dict lacking a key is skipped for that key.
            Values must be hashable, and must be strings when several
            distinct values need to be joined.

    Returns:
        A dict with one entry per key of the first dict. "unknown" values
        are ignored when any other value exists. A single remaining value is
        returned unchanged, several are joined with " or " in first-seen
        order, and if nothing but "unknown" was present the result is
        "unknown". An empty or ``None`` input returns ``{}``.
    """
    if not metadata_list:
        return {}

    merged = {}
    for key in metadata_list[0]:
        values = [md[key] for md in metadata_list if key in md]
        # dict.fromkeys preserves order while removing duplicates.
        distinct = [value for value in dict.fromkeys(values) if value != "unknown"]
        if not distinct:
            # Keep the file-wide "unknown" sentinel rather than an empty string.
            merged[key] = "unknown"
        elif len(distinct) == 1:
            merged[key] = distinct[0]
        else:
            merged[key] = " or ".join(distinct)

    return merged
933
+
934
+
935
+ def query_document_info(query_word, alternative_query_word, metadata, master_structured_lookup, faiss_index, document_chunks, llm_api_function, chunk=None, all_output=None, model_ai=None):
936
+ """
937
+ Queries the document using a hybrid approach:
938
+ 1. Local structured lookup (fast, cheap, accurate for known patterns).
939
+ 2. RAG with semantic search and LLM (general, flexible, cost-optimized).
940
+ """
941
+ if model_ai:
942
+ if model_ai == "gemini-1.5-flash-latest":
943
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
944
+ PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
945
+ PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
946
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
947
+ global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-1.5-flash-latest")#('gemini-1.5-flash-latest')
948
+ else:
949
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
950
+ # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
951
+ PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
952
+ PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
953
+
954
+ # Embedding-001 pricing per 1,000 input tokens
955
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens
956
+ global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-2.5-flash-lite")#('gemini-1.5-flash-latest')
957
+
958
+ if metadata:
959
+ extracted_country, extracted_specific_location, extracted_ethnicity, extracted_type = metadata["country"], metadata["specific_location"], metadata["ethnicity"], metadata["sample_type"]
960
+ extracted_col_date, extracted_iso, extracted_title, extracted_features = metadata["collection_date"], metadata["isolate"], metadata["title"], metadata["all_features"]
961
+ else:
962
+ extracted_country, extracted_specific_location, extracted_ethnicity, extracted_type = "unknown", "unknown", "unknown", "unknown"
963
+ extracted_col_date, extracted_iso, extracted_title = "unknown", "unknown", "unknown"
964
+ # --- NEW: Pre-process alternative_query_word to remove '.X' suffix if present ---
965
+ if alternative_query_word:
966
+ alternative_query_word_cleaned = alternative_query_word.split('.')[0]
967
+ else:
968
+ alternative_query_word_cleaned = alternative_query_word
969
+ country_explanation, sample_type_explanation = None, None
970
+
971
+ # Use the consolidated final_structured_entries for direct lookup
972
+ final_structured_entries = master_structured_lookup.get('final_structured_entries', {})
973
+ document_title = master_structured_lookup.get('document_title', 'Unknown Document Title') # Retrieve document title
974
+
975
+ # Default values for all extracted fields. These will be updated.
976
+ method_used = 'unknown' # Will be updated based on the method that yields a result
977
+ population_code_from_sl = 'unknown' # To pass to RAG prompt if available
978
+ total_query_cost = 0
979
+ # Attempt 1: Try primary query_word (e.g., isolate name) with structured lookup
980
+ try:
981
+ print("try attempt 1 in model query")
982
+ structured_info = final_structured_entries.get(query_word.upper())
983
+ if structured_info:
984
+ if extracted_country == 'unknown':
985
+ extracted_country = structured_info['country']
986
+ if extracted_type == 'unknown':
987
+ extracted_type = structured_info['type']
988
+
989
+ # if extracted_ethnicity == 'unknown':
990
+ # extracted_ethnicity = structured_info.get('ethnicity', 'unknown') # Get ethnicity from structured lookup
991
+ # if extracted_specific_location == 'unknown':
992
+ # extracted_specific_location = structured_info.get('specific_location', 'unknown') # Get specific_location from structured lookup
993
+ population_code_from_sl = structured_info['population_code']
994
+ method_used = "structured_lookup_direct"
995
+ print(f"'{query_word}' found in structured lookup (direct match).")
996
+ except:
997
+ print("pass attempt 1 in model query")
998
+ pass
999
+ # Attempt 2: Try primary query_word with heuristic range lookup if direct fails (only if not already resolved)
1000
+ try:
1001
+ print("try attempt 2 in model query")
1002
+ if method_used == 'unknown':
1003
+ query_prefix, query_num_str = _parse_individual_code_parts(query_word)
1004
+ if query_prefix is not None and query_num_str is not None:
1005
+ try: query_num = int(query_num_str)
1006
+ except ValueError: query_num = None
1007
+ if query_num is not None:
1008
+ query_prefix_upper = query_prefix.upper()
1009
+ contiguous_ranges = master_structured_lookup.get('contiguous_ranges', defaultdict(list))
1010
+ pop_code_to_country = master_structured_lookup.get('pop_code_to_country', {})
1011
+ pop_code_to_ethnicity = master_structured_lookup.get('pop_code_to_ethnicity', {})
1012
+ pop_code_to_specific_loc = master_structured_lookup.get('pop_code_to_specific_loc', {})
1013
+
1014
+ if query_prefix_upper in contiguous_ranges:
1015
+ for start_num, end_num, pop_code_for_range in contiguous_ranges[query_prefix_upper]:
1016
+ if start_num <= query_num <= end_num:
1017
+ country_from_heuristic = pop_code_to_country.get(pop_code_for_range, 'unknown')
1018
+ if country_from_heuristic != 'unknown':
1019
+ if extracted_country == 'unknown':
1020
+ extracted_country = country_from_heuristic
1021
+ if extracted_type == 'unknown':
1022
+ extracted_type = 'modern'
1023
+ # if extracted_ethnicity == 'unknown':
1024
+ # extracted_ethnicity = pop_code_to_ethnicity.get(pop_code_for_range, 'unknown')
1025
+ # if extracted_specific_location == 'unknown':
1026
+ # extracted_specific_location = pop_code_to_specific_loc.get(pop_code_for_range, 'unknown')
1027
+ population_code_from_sl = pop_code_for_range
1028
+ method_used = "structured_lookup_heuristic_range_match"
1029
+ print(f"'{query_word}' not direct. Heuristic: Falls within range {query_prefix_upper}{start_num}-{query_prefix_upper}{end_num}.")
1030
+ break
1031
+ else:
1032
+ print(f"'{query_word}' heuristic match found, but country unknown. Will fall to RAG below.")
1033
+ except:
1034
+ print("pass attempt 2 in model query")
1035
+ pass
1036
+ # Attempt 3: If primary query_word failed all structured lookups, try alternative_query_word (cleaned)
1037
+ try:
1038
+ print("try attempt 3 in model query")
1039
+ if method_used == 'unknown' and alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
1040
+ print(f"'{query_word}' not found in structured (or heuristic). Trying alternative '{alternative_query_word_cleaned}'.")
1041
+
1042
+ # Try direct lookup for alternative word
1043
+ structured_info_alt = final_structured_entries.get(alternative_query_word_cleaned.upper())
1044
+ if structured_info_alt:
1045
+ if extracted_country == 'unknown':
1046
+ extracted_country = structured_info_alt['country']
1047
+ if extracted_type == 'unknown':
1048
+ extracted_type = structured_info_alt['type']
1049
+ # if extracted_ethnicity == 'unknown':
1050
+ # extracted_ethnicity = structured_info_alt.get('ethnicity', 'unknown')
1051
+ # if extracted_specific_location == 'unknown':
1052
+ # extracted_specific_location = structured_info_alt.get('specific_location', 'unknown')
1053
+ population_code_from_sl = structured_info_alt['population_code']
1054
+ method_used = "structured_lookup_alt_direct"
1055
+ print(f"Alternative '{alternative_query_word_cleaned}' found in structured lookup (direct match).")
1056
+ else:
1057
+ # Try heuristic lookup for alternative word
1058
+ alt_prefix, alt_num_str = _parse_individual_code_parts(alternative_query_word_cleaned)
1059
+ if alt_prefix is not None and alt_num_str is not None:
1060
+ try: alt_num = int(alt_num_str)
1061
+ except ValueError: alt_num = None
1062
+ if alt_num is not None:
1063
+ alt_prefix_upper = alt_prefix.upper()
1064
+ contiguous_ranges = master_structured_lookup.get('contiguous_ranges', defaultdict(list))
1065
+ pop_code_to_country = master_structured_lookup.get('pop_code_to_country', {})
1066
+ pop_code_to_ethnicity = master_structured_lookup.get('pop_code_to_ethnicity', {})
1067
+ pop_code_to_specific_loc = master_structured_lookup.get('pop_code_to_specific_loc', {})
1068
+ if alt_prefix_upper in contiguous_ranges:
1069
+ for start_num, end_num, pop_code_for_range in contiguous_ranges[alt_prefix_upper]:
1070
+ if start_num <= alt_num <= end_num:
1071
+ country_from_heuristic_alt = pop_code_to_country.get(pop_code_for_range, 'unknown')
1072
+ if country_from_heuristic_alt != 'unknown':
1073
+ if extracted_country == 'unknown':
1074
+ extracted_country = country_from_heuristic_alt
1075
+ if extracted_type == 'unknown':
1076
+ extracted_type = 'modern'
1077
+ # if extracted_ethnicity == 'unknown':
1078
+ # extracted_ethnicity = pop_code_to_ethnicity.get(pop_code_for_range, 'unknown')
1079
+ # if extracted_specific_location == 'unknown':
1080
+ # extracted_specific_location = pop_code_to_specific_loc.get(pop_code_for_range, 'unknown')
1081
+ population_code_from_sl = pop_code_for_range
1082
+ method_used = "structured_lookup_alt_heuristic_range_match"
1083
+ break
1084
+ else:
1085
+ print(f"Alternative '{alternative_query_word_cleaned}' heuristic match found, but country unknown. Will fall to RAG below.")
1086
+ except:
1087
+ print("pass attempt 3 in model query")
1088
+ pass
1089
+ # use the context_for_llm to detect present_ancient before using llm model
1090
+ # retrieved_chunks_text = []
1091
+ # if document_chunks:
1092
+ # for idx in range(len(document_chunks)):
1093
+ # retrieved_chunks_text.append(document_chunks[idx])
1094
+ # context_for_llm = ""
1095
+ # all_context = "\n".join(retrieved_chunks_text) #
1096
+ # listOfcontexts = {"chunk": chunk,
1097
+ # "all_output": all_output,
1098
+ # "document_chunk": all_context}
1099
+ # label, context_for_llm = chooseContextLLM(listOfcontexts, query_word)
1100
+ # if not context_for_llm:
1101
+ # label, context_for_llm = chooseContextLLM(listOfcontexts, alternative_query_word_cleaned)
1102
+ # if not context_for_llm:
1103
+ # context_for_llm = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + extracted_features
1104
+ # if context_for_llm:
1105
+ # extracted_type, explain = mtdna_classifier.detect_ancient_flag(context_for_llm)
1106
+ # extracted_type = extracted_type.lower()
1107
+ # sample_type_explanation = explain
1108
+ # 5. Execute RAG if needed (either full RAG or targeted RAG for missing fields)
1109
+
1110
+ # Determine if a RAG call is necessary
1111
+ # run_rag = (extracted_country == 'unknown' or extracted_type == 'unknown')# or \
1112
+ # #extracted_ethnicity == 'unknown' or extracted_specific_location == 'unknown')
1113
+ run_rag = True
1114
+ if run_rag:
1115
+ print("try run rag")
1116
+ # Determine the phrase for LLM query
1117
+ rag_query_phrase = f"'{query_word}'"
1118
+ if alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
1119
+ rag_query_phrase += f" or its alternative word '{alternative_query_word_cleaned}'"
1120
+
1121
+ # Construct a more specific semantic query phrase for embedding if structured info is available
1122
+ semantic_query_for_embedding = rag_query_phrase # Default
1123
+ # if extracted_country != 'unknown': # If country is known from structured lookup (for targeted RAG)
1124
+ # if population_code_from_sl != 'unknown':
1125
+ # semantic_query_for_embedding = f"ethnicity and specific location for {query_word} population {population_code_from_sl} in {extracted_country}"
1126
+ # else: # If pop_code not found in structured, still use country hint
1127
+ # semantic_query_for_embedding = f"ethnicity and specific location for {query_word} in {extracted_country}"
1128
+ # print(f" DEBUG: Semantic query for embedding: '{semantic_query_for_embedding}'")
1129
+
1130
+
1131
+ # Determine fields to ask LLM for and output format based on what's known/needed
1132
+ prompt_instruction_prefix = ""
1133
+ output_format_str = ""
1134
+
1135
+ # Determine if it's a full RAG or targeted RAG scenario based on what's already extracted
1136
+ is_full_rag_scenario = True#(extracted_country == 'unknown')
1137
+
1138
+ if is_full_rag_scenario: # Full RAG scenario
1139
+ output_format_str = "country_name, modern/ancient/unknown"#, ethnicity, specific_location/unknown"
1140
+ method_used = "rag_llm"
1141
+ print(f"Proceeding to FULL RAG for {rag_query_phrase}.")
1142
+ # else: # Targeted RAG scenario (country/type already known, need ethnicity/specific_location)
1143
+ # if extracted_type == "unknown":
1144
+ # prompt_instruction_prefix = (
1145
+ # f"I already know the country is {extracted_country}. "
1146
+ # f"{f'The population code is {population_code_from_sl}. ' if population_code_from_sl != 'unknown' else ''}"
1147
+ # )
1148
+ # #output_format_str = "modern/ancient/unknown, ethnicity, specific_location/unknown"
1149
+ # output_format_str = "modern/ancient/unknown"
1150
+ # # else:
1151
+ # # prompt_instruction_prefix = (
1152
+ # # f"I already know the country is {extracted_country} and the sample type is {extracted_type}. "
1153
+ # # f"{f'The population code is {population_code_from_sl}. ' if population_code_from_sl != 'unknown' else ''}"
1154
+ # # )
1155
+ # # output_format_str = "ethnicity, specific_location/unknown"
1156
+
1157
+ # method_used = "hybrid_sl_rag"
1158
+ # print(f"Proceeding to TARGETED RAG for {rag_query_phrase}.")
1159
+
1160
+
1161
+ # Calculate embedding cost for the primary query word
1162
+ current_embedding_cost = 0
1163
+ try:
1164
+ query_embedding_vector = get_embedding(semantic_query_for_embedding, task_type="RETRIEVAL_QUERY")
1165
+ query_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(semantic_query_for_embedding).total_tokens
1166
+ current_embedding_cost += (query_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
1167
+ print(f" DEBUG: Query embedding tokens (for '{semantic_query_for_embedding}'): {query_embedding_tokens}, cost: ${current_embedding_cost:.6f}")
1168
+
1169
+ if alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
1170
+ alt_embedding_vector = get_embedding(alternative_query_word_cleaned, task_type="RETRIEVAL_QUERY")
1171
+ alt_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(alternative_query_word_cleaned).total_tokens
1172
+ current_embedding_cost += (alt_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
1173
+ print(f" DEBUG: Alternative query ('{alternative_query_word_cleaned}') embedding tokens: {alt_embedding_tokens}, cost: ${current_embedding_cost:.6f}")
1174
+
1175
+ except Exception as e:
1176
+ print(f"Error getting query embedding for RAG: {e}")
1177
+ return extracted_country, extracted_type, "embedding_failed", extracted_ethnicity, extracted_specific_location, total_query_cost
1178
+
1179
+ if query_embedding_vector is None or query_embedding_vector.shape[0] == 0:
1180
+ return extracted_country, extracted_type, "embedding_failed", extracted_ethnicity, extracted_specific_location, total_query_cost
1181
+
1182
+ D, I = faiss_index.search(np.array([query_embedding_vector]), 4)
1183
+
1184
+ retrieved_chunks_text = []
1185
+ for idx in I[0]:
1186
+ if 0 <= idx < len(document_chunks):
1187
+ retrieved_chunks_text.append(document_chunks[idx])
1188
+
1189
+ context_for_llm = ""
1190
+
1191
+ all_context = "\n".join(retrieved_chunks_text) #
1192
+ listOfcontexts = {"chunk": chunk,
1193
+ "all_output": all_output,
1194
+ "document_chunk": all_context}
1195
+ label, context_for_llm = chooseContextLLM(listOfcontexts, query_word)
1196
+ if not context_for_llm:
1197
+ label, context_for_llm = chooseContextLLM(listOfcontexts, alternative_query_word_cleaned)
1198
+ if not context_for_llm:
1199
+ context_for_llm = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + extracted_features
1200
+ #print("context for llm: ", label)
1201
+ # prompt_for_llm = (
1202
+ # f"{prompt_instruction_prefix}"
1203
+ # f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in general if these specific identifiers are not explicitly found. "
1204
+ # f"Identify its primary associated country/geographic location. "
1205
+ # f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
1206
+ # f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
1207
+ # f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
1208
+ # f"Additionally, extract its ethnicity and a more specific location (city/district level) within the predicted country. "
1209
+ # f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
1210
+ # f"Provide only the country, sample type, ethnicity, and specific location, do not add extra explanations.\n\n"
1211
+ # f"Text Snippets:\n{context_for_llm}\n\n"
1212
+ # f"Output Format: {output_format_str}"
1213
+ # )
1214
+ if len(context_for_llm) > 1000*1000:
1215
+ context_for_llm = context_for_llm[:900000]
1216
+
1217
+ # fix the prompt better:
1218
+ # firstly clarify more by saying which type of organism, prioritize homo sapiens
1219
+ features = metadata["all_features"]
1220
+ organism = "general"
1221
+ if features != "unknown":
1222
+ if "organism" in features:
1223
+ try:
1224
+ organism = features.split("organism: ")[1].split("\n")[0]
1225
+ except:
1226
+ organism = features.replace("\n","; ")
1227
+ explain_list = "country or sample type (modern/ancient)" #or ethnicity or specific location (province/city)"
1228
+
1229
+ # prompt_for_llm = (
1230
+ # f"{prompt_instruction_prefix}"
1231
+ # f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in general if these specific identifiers are not explicitly found. "
1232
+ # f"Identify its primary associated country/geographic location. "
1233
+ # f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
1234
+ # f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
1235
+ # f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
1236
+ # f"Provide only {output_format_str}. "
1237
+ # f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
1238
+ # f"If the country or sample type (modern/ancient) is not 'unknown', write 1 sentence after the output explaining how you inferred it from the text (one sentence for each)."
1239
+ # f"\n\nText Snippets:\n{context_for_llm}\n\n"
1240
+ # f"Output Format: {output_format_str}"
1241
+ # )
1242
+
1243
+ # prompt_for_llm = (
1244
+ # f"{prompt_instruction_prefix}"
1245
+ # f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in {organism} if these specific identifiers are not explicitly found. "
1246
+ # f"Identify its primary associated country/geographic location. "
1247
+ # f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
1248
+ # f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
1249
+ # f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
1250
+ # f"Provide only {output_format_str}. "
1251
+ # f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
1252
+ # f"If the {explain_list} is not 'unknown', write 1 sentence after the output explaining how you inferred it from the text (one sentence for each)."
1253
+ # f"\n\nText Snippets:\n{context_for_llm}\n\n"
1254
+ # f"Output Format: {output_format_str}"
1255
+ # )
1256
+ # prompt_for_llm = (
1257
+ # f"{prompt_instruction_prefix}"
1258
+ # f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
1259
+ # f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
1260
+ # f"Identify its **primary associated geographic location**, preferring the most specific available: "
1261
+ # f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
1262
+ # f"the next most specific region, continent, island, or other clear geographic area mentioned. "
1263
+ # f"If no geographic clues at all are present, state 'unknown' for location. "
1264
+ # f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
1265
+ # f"or 'ancient' (prehistoric/archaeological) source. "
1266
+ # f"If the text does not specify ancient or archaeological context, assume 'modern'. "
1267
+ # f"Provide only {output_format_str}. "
1268
+ # f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
1269
+ # f"For each non-'unknown' field in {explain_list}, write one sentence explaining how it was inferred from the text (one sentence for each)."
1270
+ # f"\n\nText Snippets:\n{context_for_llm}\n\n"
1271
+ # f"Output Format: {output_format_str}"
1272
+ # )
1273
+ prompt_for_llm = (
1274
+ f"{prompt_instruction_prefix}"
1275
+ f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
1276
+ f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
1277
+ f"Identify its **primary associated geographic location**, preferring the most specific available: "
1278
+ f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
1279
+ f"the next most specific region, continent, island, or other clear geographic area mentioned. "
1280
+ f"If no geographic clues at all are present, state 'unknown' for location. "
1281
+ f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
1282
+ f"or 'ancient' (prehistoric/archaeological) source. "
1283
+ f"If the text does not specify ancient or archaeological context, assume 'modern'. "
1284
+ f"Provide only {output_format_str}. "
1285
+ f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
1286
+ f"For each non-'unknown' field in {explain_list}, write one sentence explaining how it was inferred from the text "
1287
+ f"(one sentence for each). "
1288
+ f"Format your answer so that:\n"
1289
+ f"1. The **first line** contains only the {output_format_str} answer.\n"
1290
+ f"2. The **second line onward** contains the explanations.\n"
1291
+ f"\nText Snippets:\n{context_for_llm}\n\n"
1292
+ f"Output Format Example:\nChina, modern, Daur, Heilongjiang province.\n"
1293
+ f"The text explicitly states \"chinese Daur ethnic group in Heilongjiang province\", indicating the country, "
1294
+ f"the ethnicity, and the specific province. The study is published in a journal, implying research on living "
1295
+ f"individuals, hence modern."
1296
+ )
1297
+
1298
+ if model_ai:
1299
+ print("back up to ", model_ai)
1300
+ llm_response_text, model_instance = call_llm_api(prompt_for_llm, model=model_ai)
1301
+ else:
1302
+ print("still 2.5 flash gemini")
1303
+ llm_response_text, model_instance = call_llm_api(prompt_for_llm)
1304
+ print("\n--- DEBUG INFO FOR RAG ---")
1305
+ print("Retrieved Context Sent to LLM (first 500 chars):")
1306
+ print(context_for_llm[:500] + "..." if len(context_for_llm) > 500 else context_for_llm)
1307
+ print("\nRaw LLM Response:")
1308
+ print(llm_response_text)
1309
+ print("--- END DEBUG INFO ---")
1310
+
1311
+ llm_cost = 0
1312
+ if model_instance:
1313
+ try:
1314
+ input_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(prompt_for_llm).total_tokens
1315
+ output_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(llm_response_text).total_tokens
1316
+ print(f" DEBUG: LLM Input tokens: {input_llm_tokens}")
1317
+ print(f" DEBUG: LLM Output tokens: {output_llm_tokens}")
1318
+ llm_cost = (input_llm_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
1319
+ (output_llm_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
1320
+ print(f" DEBUG: Estimated LLM cost: ${llm_cost:.6f}")
1321
+ except Exception as e:
1322
+ print(f" DEBUG: Error counting LLM tokens: {e}")
1323
+ llm_cost = 0
1324
+
1325
+ total_query_cost += current_embedding_cost + llm_cost
1326
+ print(f" DEBUG: Total estimated cost for this RAG query: ${total_query_cost:.6f}")
1327
+ # Parse the LLM's response based on the Output Format actually used
1328
+ # if output_format_str == "ethnicity, specific_location/unknown": # Targeted RAG output
1329
+ # extracted_ethnicity,extracted_specific_location = clean_llm_output(llm_response_text, output_format_str)
1330
+ # elif output_format_str == "modern/ancient/unknown, ethnicity, specific_location/unknown":
1331
+ # extracted_type, extracted_ethnicity,extracted_specific_location=clean_llm_output(llm_response_text, output_format_str)
1332
+ # else: # Full RAG output (country, type, ethnicity, specific_location)
1333
+ # extracted_country,extracted_type, extracted_ethnicity,extracted_specific_location=clean_llm_output(llm_response_text, output_format_str)
1334
+ metadata_list = parse_multi_sample_llm_output(llm_response_text, output_format_str)
1335
+ # merge_metadata = merge_metadata_outputs(metadata_list)
1336
+ # if output_format_str == "country_name, modern/ancient/unknown":
1337
+ # extracted_country, extracted_type = merge_metadata["country"], merge_metadata["sample_type"]
1338
+ # country_explanation,sample_type_explanation = merge_metadata["country_explanation"], merge_metadata["sample_type_explanation"]
1339
+ # elif output_format_str == "modern/ancient/unknown":
1340
+ # extracted_type = merge_metadata["sample_type"]
1341
+ # sample_type_explanation = merge_metadata["sample_type_explanation"]
1342
+ # for the output_format that is not default
1343
+ if output_format_str == "country_name, modern/ancient/unknown":
1344
+ outputs = output_format_str.split(", ")
1345
+ extracted_country, extracted_type = metadata_list[outputs[0]]["answer"], metadata_list[outputs[1]]["answer"]
1346
+ country_explanation,sample_type_explanation = metadata_list[outputs[0]][outputs[0]+"_explanation"], metadata_list[outputs[1]][outputs[1]+"_explanation"]
1347
+ # extracted_ethnicity, extracted_specific_location = metadata_list[outputs[2]]["answer"], metadata_list[outputs[3]]["answer"]
1348
+ # ethnicity_explanation, specific_loc_explanation = metadata_list[outputs[2]][outputs[2]+"_explanation"], metadata_list[outputs[3]][outputs[3]+"_explanation"]
1349
+ # 6. Optional: Second LLM call for specific_location from general knowledge if still unknown
1350
+ # if extracted_specific_location == 'unknown':
1351
+ # # Check if we have enough info to ask general knowledge LLM
1352
+ # if extracted_country != 'unknown' and extracted_ethnicity != 'unknown':
1353
+ # print(f" DEBUG: Specific location still unknown. Querying general knowledge LLM from '{extracted_ethnicity}' and '{extracted_country}'.")
1354
+
1355
+ # general_knowledge_prompt = (
1356
+ # f"Based on general knowledge, what is a highly specific location (city or district) "
1357
+ # f"associated with the ethnicity '{extracted_ethnicity}' in '{extracted_country}'? "
1358
+ # f"Consider the context of scientific studies on human genetics, if known. "
1359
+ # f"If no common specific location is known, state 'unknown'. "
1360
+ # f"Provide only the city or district name, or 'unknown'."
1361
+ # )
1362
+
1363
+ # general_llm_response, general_llm_model_instance = call_llm_api(general_knowledge_prompt, model_name='gemini-1.5-flash-latest')
1364
+
1365
+ # if general_llm_response and general_llm_response.lower().strip() != 'unknown':
1366
+ # extracted_specific_location = general_llm_response.strip() + " (predicted from general knowledge)"
1367
+ # # Add cost of this second LLM call
1368
+ # if general_llm_model_instance:
1369
+ # try:
1370
+ # gk_input_tokens = general_llm_model_instance.count_tokens(general_knowledge_prompt).total_tokens
1371
+ # gk_output_tokens = general_llm_model_instance.count_tokens(general_llm_response).total_tokens
1372
+ # gk_cost = (gk_input_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
1373
+ # (gk_output_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
1374
+ # print(f" DEBUG: General Knowledge LLM cost to predict specific location alone: ${gk_cost:.6f}")
1375
+ # total_query_cost += gk_cost # Accumulate cost
1376
+ # except Exception as e:
1377
+ # print(f" DEBUG: Error counting GK LLM tokens: {e}")
1378
+ # else:
1379
+ # print(" DEBUG: General knowledge LLM returned unknown or empty for specific location.")
1380
+ # # 6. Optional: Second LLM call for ethnicity from general knowledge if still unknown
1381
+ # if extracted_ethnicity == 'unknown':
1382
+ # # Check if we have enough info to ask general knowledge LLM
1383
+ # if extracted_country != 'unknown' and extracted_specific_location != 'unknown':
1384
+ # print(f" DEBUG: Ethnicity still unknown. Querying general knowledge LLM from '{extracted_specific_location}' and '{extracted_country}'.")
1385
+
1386
+ # general_knowledge_prompt = (
1387
+ # f"Based on general knowledge, what is a highly ethnicity (population) "
1388
+ # f"associated with the specific location '{extracted_specific_location}' in '{extracted_country}'? "
1389
+ # f"Consider the context of scientific studies on human genetics, if known. "
1390
+ # f"If no common ethnicity is known, state 'unknown'. "
1391
+ # f"Provide only the ethnicity or popluation name, or 'unknown'."
1392
+ # )
1393
+
1394
+ # general_llm_response, general_llm_model_instance = call_llm_api(general_knowledge_prompt, model_name='gemini-1.5-flash-latest')
1395
+
1396
+ # if general_llm_response and general_llm_response.lower().strip() != 'unknown':
1397
+ # extracted_ethnicity = general_llm_response.strip() + " (predicted from general knowledge)"
1398
+ # # Add cost of this second LLM call
1399
+ # if general_llm_model_instance:
1400
+ # try:
1401
+ # gk_input_tokens = general_llm_model_instance.count_tokens(general_knowledge_prompt).total_tokens
1402
+ # gk_output_tokens = general_llm_model_instance.count_tokens(general_llm_response).total_tokens
1403
+ # gk_cost = (gk_input_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
1404
+ # (gk_output_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
1405
+ # print(f" DEBUG: General Knowledge LLM cost to predict ethnicity alone: ${gk_cost:.6f}")
1406
+ # total_query_cost += gk_cost # Accumulate cost
1407
+ # except Exception as e:
1408
+ # print(f" DEBUG: Error counting GK LLM tokens: {e}")
1409
+ # else:
1410
+ # print(" DEBUG: General knowledge LLM returned unknown or empty for ethnicity.")
1411
+
1412
+
1413
+ #return extracted_country, extracted_type, method_used, extracted_ethnicity, extracted_specific_location, total_query_cost
1414
+ return extracted_country, extracted_type, method_used, country_explanation, sample_type_explanation, total_query_cost
core/mtdna_backend.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, tempfile, json, re
2
+ # import io
3
+
4
+ # from app import send_log
5
+
6
+ from core.mtdna_classifier import classify_sample_location
7
+ import core.pipeline
8
+ import pandas as pd
9
+
10
+ import gspread
11
+ from oauth2client.service_account import ServiceAccountCredentials
12
+ import hashlib
13
+
14
+ # ✅ Load credentials from Hugging Face secret and ✅ Setup Google Sheets
15
# OAuth scopes requested for the service account (Sheets feed + Drive).
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
# The service-account key is a JSON document stored in the GCP_CREDS_JSON
# environment variable; a missing variable raises KeyError at import time.
creds_dict = json.loads(os.environ["GCP_CREDS_JSON"])
creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
# Module-wide gspread client shared by every sheet helper below.
client = gspread.authorize(creds)
19
+
20
+ # ✅ Extract accessions from input
21
def extract_accessions_from_input(file=None, raw_text=""):
    """Collect unique accession IDs from an uploaded table and/or pasted text.

    Args:
        file: Optional upload. Either an object exposing a ``name`` attribute
            or a plain path. The name must end in ``.csv`` or ``.xlsx``
            (case-insensitive); accessions are read from the first column.
        raw_text: Optional free text. IDs are split on newlines, commas,
            semicolons and tabs.

    Returns:
        tuple: ``(accessions, error)``. ``accessions`` is a list of unique,
        stripped, non-empty IDs in first-seen order (file entries first, then
        text entries). ``error`` is ``None`` on success, otherwise a message
        string, in which case ``accessions`` is ``[]``.
    """
    accessions = []
    seen = set()

    def _add_unique(candidates):
        # Preserve first-seen order while dropping duplicates and blanks.
        for acc in candidates:
            if acc and acc not in seen:
                seen.add(acc)
                accessions.append(acc)

    if file:
        try:
            # Uploads may arrive as file-like objects (with .name) or as paths.
            file_name = str(getattr(file, "name", file)).lower()
            if file_name.endswith(".csv"):
                df = pd.read_csv(file)
            elif file_name.endswith(".xlsx"):
                df = pd.read_excel(file)
            else:
                return [], "Unsupported file format. Please upload CSV or Excel."
            _add_unique(df.iloc[:, 0].dropna().astype(str).str.strip())
        except Exception as e:
            # Any parsing problem is reported to the caller as a message.
            return [], f"Failed to read file: {e}"

    if raw_text:
        _add_unique(s.strip() for s in re.split(r"[\n,;\t]", raw_text))

    return accessions, None
48
+
49
+ # ✅ Load and save usage count
50
def hash_user_id(user_input):
    """Return the SHA-256 hex digest of ``user_input`` (encoded with str.encode())."""
    digest = hashlib.sha256()
    digest.update(user_input.encode())
    return digest.hexdigest()
52
+
53
def load_user_usage():
    """Read per-user usage counters from the ``user_usage_log`` Google Sheet.

    The first worksheet is read in full. Header names are stripped and
    lower-cased; the sheet must contain ``email`` and ``usage_count`` columns,
    and may contain ``permitted_samples``.

    Returns:
        tuple: ``(usage, permitted)`` where ``usage`` maps lower-cased e-mail
        to an int usage count (0 when the cell is not numeric) and
        ``permitted`` maps lower-cased e-mail to an int quota (50 when the
        cell is not numeric). ``permitted`` is only filled when the sheet has
        a ``permitted_samples`` column. Both dicts are empty when the sheet is
        empty, has no data rows, lacks the required headers, or cannot be read.
        A two-tuple is returned on every path so callers can always unpack it.
    """
    try:
        sheet = client.open("user_usage_log").sheet1
        data = sheet.get_all_values()

        # Need a header row plus at least one data row.
        if not data or len(data) < 2:
            return {}, {}

        headers = [h.strip().lower() for h in data[0]]
        if "email" not in headers or "usage_count" not in headers:
            return {}, {}

        has_permitted = "permitted_samples" in headers
        df = pd.DataFrame(data[1:], columns=headers)

        usage = {}
        permitted = {}
        for _, row in df.iterrows():
            email = row.get("email", "").strip().lower()
            if not email:
                continue

            # Counts may be stored as "5", "5.0" or junk; junk counts as 0.
            try:
                usage[email] = int(float(row.get("usage_count", 0)))
            except (TypeError, ValueError, OverflowError):
                usage[email] = 0

            if has_permitted:
                try:
                    permitted[email] = int(float(row.get("permitted_samples", 50)))
                except (TypeError, ValueError, OverflowError):
                    permitted[email] = 50
        return usage, permitted

    except Exception as e:
        print(f"❌ Error in load_user_usage: {e}")
        return {}, {}
105
+
106
def save_user_usage(usage_dict):
    """Write usage counters back to the ``user_usage_log`` Google Sheet.

    Existing rows are kept; only the e-mails present in ``usage_dict`` have
    their ``usage_count`` overwritten (new e-mails are appended as new rows).
    The sheet is then cleared and rewritten with every cell as a string.

    Args:
        usage_dict: Mapping of e-mail -> usage count. Keys are stripped and
            lower-cased before being matched against the sheet.

    Returns:
        None. Failures are printed, not raised.
    """
    try:
        sheet = client.open("user_usage_log").sheet1

        # Read existing data
        existing_data = sheet.get_all_values()
        if existing_data and len(existing_data[0]) >= 2:
            df_old = pd.DataFrame(existing_data[1:], columns=existing_data[0])
            df_old["email"] = df_old["email"].str.strip().str.lower()
            df_old["usage_count"] = (
                pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
            )
        else:
            df_old = pd.DataFrame(columns=["email", "usage_count"])

        # Overwrite specific emails only
        df_old = df_old.set_index("email")
        for email, count in usage_dict.items():
            df_old.loc[email.strip().lower(), "usage_count"] = count
        df_old = df_old.reset_index()

        # Adding rows via .loc can upcast the counter column to float and
        # leaves NaN in the other columns; normalise both so the sheet gets
        # "5" rather than "5.0" and blank cells rather than the text "nan".
        df_old["usage_count"] = (
            pd.to_numeric(df_old["usage_count"], errors="coerce").fillna(0).astype(int)
        )
        payload = df_old.astype(object).where(df_old.notna(), "")
        rows = [payload.columns.tolist()] + payload.astype(str).values.tolist()

        # Build the full payload before clearing so a formatting error cannot
        # leave the sheet wiped.
        sheet.clear()
        sheet.update(rows)
        print("✅ Saved user usage to user_usage_log sheet.")

    except Exception as e:
        print(f"❌ Failed to save user usage to Google Sheets: {e}")
139
+
140
def increment_usage(email: str, count: int = 1):
    """Add ``count`` to a user's stored usage and persist the result.

    Args:
        email: User e-mail; stripped and lower-cased to form the lookup key.
        count: Number of samples to add to the current usage.

    Returns:
        tuple: ``(new_usage, max_allowed)`` where ``max_allowed`` is the
        user's permitted quota from the sheet, or 50 when none is recorded.
    """
    loaded = load_user_usage()
    # The loader normally returns (usage, permitted); tolerate a single-dict
    # return so an empty or malformed sheet cannot crash the unpacking here.
    if isinstance(loaded, tuple) and len(loaded) == 2:
        usage, permitted = loaded
    else:
        usage, permitted = (loaded or {}), {}

    email_key = email.strip().lower()
    current = usage.get(email_key, 0)
    # Never store a value lower than what is already recorded.
    usage[email_key] = max(current, current + count)
    max_allowed = permitted.get(email_key) or 50

    save_user_usage(usage)
    return usage[email_key], max_allowed
152
+
153
+ # ✅ Save user feedbacks
154
def store_feedback_to_google_sheets(accession, answer1, answer2, contact=""):
    """Append one feedback row to the ``feedback_mtdna`` Google Sheet.

    Args:
        accession: Accession the feedback refers to.
        answer1: Answer to the first feedback question (must be non-blank).
        answer2: Answer to the second feedback question (must be non-blank).
        contact: Optional contact string stored alongside the answers.

    Returns:
        str: A status message (warning, success, or error text).
    """
    both_answered = bool(answer1.strip()) and bool(answer2.strip())
    if not both_answered:
        return "⚠️ Please answer both questions before submitting."

    try:
        # The spreadsheet name must match the one shared with the service account.
        feedback_sheet = client.open("feedback_mtdna").sheet1
        feedback_sheet.append_row([accession, answer1, answer2, contact])
    except Exception as e:
        return f"❌ Error submitting feedback: {e}"
    return "✅ Feedback submitted. Thank you!"
168
+
169
+ # ✅ save cost by checking the known outputs
170
def check_known_output(accession):
    """Look up a previously stored prediction in the ``known_samples`` sheet.

    If ``accession`` contains something shaped like an accession number
    (2-4 capital letters followed by 4+ digits), only that part is used for
    the lookup. Rows are matched when their ``Sample ID`` contains the
    accession as a literal, case-insensitive substring.

    Args:
        accession: Accession string to look up.

    Returns:
        dict | None: The first matching row as a dict when both its
        ``Predicted Country`` and ``Predicted Sample Type`` are filled in and
        not "unknown"; otherwise ``None``. Any error while reading the sheet
        also yields ``None``, so the caller falls back to running the pipeline.
    """
    try:
        sheet = client.open("known_samples").sheet1

        data = sheet.get_all_values()
        if not data:
            return None

        df = pd.DataFrame(data[1:], columns=data[0])
        if "Sample ID" not in df.columns:
            return None

        match = re.search(r"\b[A-Z]{2,4}\d{4,}", accession)
        if match:
            accession = match.group(0)

        # An empty needle would match every row and return an unrelated sample.
        if not accession.strip():
            return None

        # regex=False: the accession is user input, not a pattern.
        hits = df[df["Sample ID"].str.contains(accession, case=False, na=False, regex=False)]
        if hits.empty:
            return None

        row = hits.iloc[0]
        country = row.get("Predicted Country", "").strip().lower()
        sample_type = row.get("Predicted Sample Type", "").strip().lower()
        if country and country != "unknown" and sample_type and sample_type != "unknown":
            return row.to_dict()
        return None

    except Exception:
        # Best-effort cache: a lookup failure is treated as a cache miss.
        return None
211
+
212
+ # Helper: `get_incomplete_accessions()` lists the accessions in a saved results file whose rows are still empty
213
def get_incomplete_accessions(file_path):
    """Return accessions from an Excel results file whose rows are still empty.

    A row counts as incomplete when it has a ``Sample ID`` and every other
    cell is either missing (NaN) or a blank/whitespace-only string. The
    accession is the first match of 2-4 capital letters followed by 4+ digits
    inside the sample ID; rows without such a match are ignored.

    Args:
        file_path: Path (or file object) accepted by ``pandas.read_excel``.

    Returns:
        list[str]: Accessions of incomplete rows, in file order.
    """
    df = pd.read_excel(file_path)
    accession_pattern = re.compile(r"\b[A-Z]{2,4}\d{4,}")

    incomplete_accessions = []
    for _, row in df.iterrows():
        sample_id = str(row.get("Sample ID", "")).strip()

        # Skip if no sample ID
        if not sample_id:
            continue

        # Judge each cell on its own so a row mixing NaN and "" cells is
        # still recognised as empty.
        other_cols = row.drop(labels=["Sample ID"], errors="ignore")
        is_blank = other_cols.isna() | (other_cols.astype(str).str.strip() == "")
        if is_blank.all():
            match = accession_pattern.search(sample_id)
            if match:
                incomplete_accessions.append(match.group(0))
    return incomplete_accessions
233
+
234
+ # Small pipeline wrapper
235
def pipeline_classify_sample_location_cached(accession, stop_flag=None, save_df=None):
    """Run the Gemini pipeline for a single accession.

    Args:
        accession: Accession ID; passed to the pipeline as a one-element list.
        stop_flag: Optional stop signal forwarded unchanged to the pipeline.
        save_df: Optional DataFrame forwarded unchanged to the pipeline.

    Returns:
        Whatever ``core.pipeline.pipeline_with_gemini`` returns.
    """
    # ``import core.pipeline`` binds only the name ``core`` in this module, so
    # the function must be reached through the package; a bare ``pipeline``
    # would raise NameError.
    return core.pipeline.pipeline_with_gemini([accession], stop_flag=stop_flag, save_df=save_df)
239
+
240
def summarize_results(accession, stop_flag=None):
    """Produce one result row for ``accession``, using the cache when possible.

    The ``known_samples`` sheet is consulted first via ``check_known_output``;
    only on a miss is the pipeline run. The pipeline output is expected to map
    accession -> dict with ``country`` and ``sample_type`` sections (each a
    mapping of predicted label -> list of method/explanation strings) plus
    ``isolate``, ``source``, ``query_cost``, ``time_cost`` and, optionally,
    ``file_chunk`` / ``file_all_output``.

    Args:
        accession: Accession ID to summarize.
        stop_flag: Optional object with a truthy/falsy ``value`` attribute;
            when set, processing is skipped.

    Returns:
        dict | list | None: The cached row or a freshly built row dict keyed
        by the sheet column names; ``[]`` when stopped, when the pipeline
        fails, or when the accession is missing from its output; ``None`` when
        the ``known_samples`` sheet is empty.
    """

    def _summarize_section(results):
        # Join the predicted labels and flatten their supporting methods.
        labels = []
        explanation = ""
        for label, methods in results.items():
            if len(label) == 0:
                label = "unknown"
            if len(methods) == 0:
                explanation = "unknown"
            else:
                explanation += 'Method: ' + "\nMethod: ".join(methods) + "\n"
            labels.append(label)
        return "\n".join(labels), explanation

    # Early bail
    if stop_flag is not None and stop_flag.value:
        return []

    # Try cache first
    cached = check_known_output(accession)
    if cached:
        return cached

    # Only run the pipeline when nothing usable is in the cache
    try:
        sheet = client.open("known_samples").sheet1
        data = sheet.get_all_values()
        if not data:
            return None

        save_df = pd.DataFrame(data[1:], columns=data[0])
        if stop_flag is not None and stop_flag.value:
            return []
        outputs = pipeline_classify_sample_location_cached(accession, stop_flag, save_df)
    except Exception as e:
        # Report the failure instead of hiding it; callers still get [].
        print(f"❌ summarize_results failed for {accession}: {e}")
        return []

    if not outputs or accession not in outputs:
        return []

    result = outputs[accession]

    pred_country, country_explanation = "unknown", "unknown"
    pred_sample, sample_explanation = "unknown", "unknown"
    if "country" in result:
        pred_country, country_explanation = _summarize_section(result["country"])
    if "sample_type" in result:
        pred_sample, sample_explanation = _summarize_section(result["sample_type"])

    isolate = str(result.get("isolate") or "unknown")
    if isolate.lower() != "unknown":
        label = accession + "(Isolate: " + isolate + ")"
    else:
        label = accession

    sources = result.get("source") or ["No Links"]

    # Optional keys are read with .get so a pipeline result that omits them
    # (e.g. no file_chunk) yields blanks rather than a KeyError.
    return {
        "Sample ID": label or "unknown",
        "Predicted Country": pred_country or "unknown",
        "Country Explanation": country_explanation or "unknown",
        "Predicted Sample Type": pred_sample or "unknown",
        "Sample Type Explanation": sample_explanation or "unknown",
        "Sources": "\n".join(sources) or "No Links",
        "Query_cost": result.get("query_cost") or "",
        "Time cost": result.get("time_cost") or "",
        "file_chunk": result.get("file_chunk") or "",
        "file_all_output": result.get("file_all_output") or "",
    }
341
+
342
+ # def run_each_accessions(accession)
343
+
344
+ # save the batch output in excel file
345
def save_to_excel(all_rows, summary_text, flag_text, filename, is_resume=False):
    """Write batch results to an Excel file, optionally merging into an old one.

    Args:
        all_rows: Row data accepted by ``pandas.DataFrame`` (list of rows,
            list of dicts, or a DataFrame); restricted to the seven result
            columns listed below.
        summary_text: Unused; kept for a uniform writer signature.
        flag_text: Unused; kept for a uniform writer signature.
        filename: Destination ``.xlsx`` path.
        is_resume: When true and ``filename`` exists, rows already in the
            file are updated by ``Sample ID`` and rows not yet in the file are
            appended, instead of the file being replaced.

    Returns:
        None.
    """
    columns = [
        "Sample ID", "Predicted Country", "Country Explanation",
        "Predicted Sample Type", "Sample Type Explanation",
        "Sources", "Time cost",
    ]
    df_new = pd.DataFrame(all_rows, columns=columns)

    if is_resume and os.path.exists(filename):
        try:
            df_old = pd.read_excel(filename)
        except Exception:
            # Unreadable previous file: fall back to the new rows only.
            df_old = pd.DataFrame(columns=columns)
        if "Sample ID" not in df_old.columns:
            df_old = pd.DataFrame(columns=columns)

        # object dtype lets text overwrite columns that were read as all-NaN.
        df_old = df_old.astype(object).set_index("Sample ID")
        df_new = df_new.set_index("Sample ID")

        # update() only touches IDs already in the old file ...
        df_old.update(df_new)
        # ... so IDs that are new in this run must be appended explicitly.
        unseen = df_new.loc[~df_new.index.isin(df_old.index)]
        if unseen.empty:
            merged = df_old
        elif df_old.empty:
            merged = unseen
        else:
            merged = pd.concat([df_old, unseen])
        df_combined = merged.reset_index()
    else:
        # If not resuming or file doesn't exist, just use new rows
        df_combined = df_new

    df_combined.to_excel(filename, index=False)
373
+
374
+
375
+ # save the batch output in JSON file
376
def save_to_json(all_rows, summary_text, flag_text, filename):
    """Dump batch results to ``filename`` as JSON under ``Detailed_Results``.

    Args:
        all_rows: A DataFrame (converted to a list of record dicts) or any
            JSON-serialisable object, written as-is.
        summary_text: Unused; kept for a uniform writer signature.
        flag_text: Unused; kept for a uniform writer signature.
        filename: Destination path.
    """
    if isinstance(all_rows, pd.DataFrame):
        detailed = all_rows.to_dict(orient="records")
    else:
        detailed = all_rows

    payload = {"Detailed_Results": detailed}
    with open(filename, "w") as out_file:
        json.dump(payload, out_file, indent=2)
389
+
390
+ # save the batch output in Text file
391
def save_to_txt(all_rows, summary_text, flag_text, filename):
    """Write batch results to a plain-text file as comma-joined lines.

    The file starts with a ``=== Detailed Results ===`` banner, followed by a
    header line built from the first record's keys and one line per record,
    each separated by a blank line. With no records only the banner is written.

    Args:
        all_rows: A DataFrame or an iterable of dict records.
        summary_text: Unused; kept for a uniform writer signature.
        flag_text: Unused; kept for a uniform writer signature.
        filename: Destination path (written as UTF-8).
    """
    if isinstance(all_rows, pd.DataFrame):
        detailed_results = all_rows.to_dict(orient="records")
    else:
        # Plain record lists previously hit an unbound variable here.
        detailed_results = list(all_rows) if all_rows is not None else []

    lines = []
    if detailed_results:
        lines.append(",".join(str(k) for k in detailed_results[0].keys()))
        lines.extend(",".join(str(v) for v in r.values()) for r in detailed_results)
    output = "".join(line + "\n\n" for line in lines)

    # Explicit encoding: results may contain non-ASCII place names.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("=== Detailed Results ===\n")
        f.write(output + "\n")
402
+
403
def save_batch_output(all_rows, output_type, summary_text=None, flag_text=None):
    """Export the batch results table to a temporary file for download.

    Args:
        all_rows: HTML markup of the results table; the first ``<table>`` is
            parsed back into a DataFrame.
        output_type: One of ``"Excel"``, ``"JSON"`` or ``"TXT"``.
        summary_text: Forwarded to the writer (currently unused by them).
        flag_text: Forwarded to the writer (currently unused by them).

    Returns:
        The result of ``gr.update``: hidden for an unknown ``output_type``,
        otherwise visible with ``value`` set to the written file's path.
    """
    # NOTE(review): `gr` (gradio) is not imported in the visible part of this
    # module -- confirm it is in scope, otherwise these calls raise NameError.
    # Local import: the module does not import StringIO at top level.
    from io import StringIO

    writers = {
        "Excel": ("batch_output.xlsx", save_to_excel),
        "JSON": ("batch_output.json", save_to_json),
        "TXT": ("batch_output.txt", save_to_txt),
    }
    if output_type not in writers:
        return gr.update(visible=False)  # invalid option

    file_name, writer = writers[output_type]

    # Parse back to DataFrame; read_html returns a list of tables.
    table = pd.read_html(StringIO(all_rows))[0]

    tmp_dir = tempfile.mkdtemp()
    file_path = os.path.join(tmp_dir, file_name)
    writer(table, summary_text, flag_text, file_path)

    return gr.update(value=file_path, visible=True)
core/mtdna_classifier.py ADDED
@@ -0,0 +1,764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mtDNA Location Classifier MVP (Google Colab)
2
+ # Accepts accession number → Fetches PubMed ID + isolate name → Gets abstract → Predicts location
3
+ import os
4
+ #import streamlit as st
5
+ import subprocess
6
+ import re
7
+ from Bio import Entrez
8
+ import fitz
9
+ import spacy
10
+ from spacy.cli import download
11
+ # from core.NER.PDF import pdf
12
+ # from core.NER.WordDoc import wordDoc
13
+ # from core.NER.html import extractHTML
14
+ # from core.NER.word2Vec import word2vec
15
+ from transformers import pipeline
16
+ import urllib.parse, requests
17
+ from pathlib import Path
18
+ from core.upgradeClassify import filter_context_for_sample, infer_location_for_sample
19
+
20
+ # Set your email (required by NCBI Entrez)
21
+ #Entrez.email = "your-email@example.com"
22
+ import nltk
23
+
24
+ nltk.download("stopwords")
25
+ nltk.download("punkt")
26
+ nltk.download('punkt_tab')
27
+ # Step 1: Get PubMed ID from Accession using EDirect
28
+ from Bio import Entrez, Medline
29
+ import re
30
+
31
+ Entrez.email = "your_email@example.com"
32
+
33
+ # --- Helper Functions (Re-organized and Upgraded) ---
34
+
35
_KNOWN_COUNTRIES = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan",
    "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi",
    "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo (Brazzaville)", "Congo (Kinshasa)", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czechia",
    "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia",
    "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana",
    "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan",
    "Kazakhstan", "Kenya", "Kiribati", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg",
    "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar",
    "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway", "Oman",
    "Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda",
    "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland", "Syria",
    "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu",
    "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam",
    "Yemen", "Zambia", "Zimbabwe"
]

# Compiled once at import time. Longer names are tried first so that a name
# such as "Guinea-Bissau" is not cut short to "Guinea" at the same position.
_COUNTRY_PATTERN = re.compile(
    r'\b(' + '|'.join(re.escape(c) for c in sorted(_KNOWN_COUNTRIES, key=len, reverse=True)) + r')\b',
    re.IGNORECASE,
)

# Qualifier names that share a slot with another qualifier.
_QUALIFIER_ALIASES = {"specimen_voucher": "specimen"}


def _unknown_ncbi_metadata():
    """Return the all-"unknown" result used when nothing can be extracted."""
    return {"country": "unknown",
            "specific_location": "unknown",
            "ethnicity": "unknown",
            "sample_type": "unknown",
            "collection_date": "unknown",
            "isolate": "unknown",
            "title": "unknown",
            "doi": "unknown",
            "pubmed_id": None,
            "all_features": "unknown"}


def _split_country_and_locality(text):
    """Split free text into (country, remaining locality) using the known-country pattern."""
    match = _COUNTRY_PATTERN.search(text)
    if not match:
        return "unknown", text
    country = match.group(1)
    remainder = re.sub(re.escape(country), '', text, flags=re.IGNORECASE).strip()
    remainder = remainder.replace(',', '').replace(':', '').replace(';', '').strip()
    return country, (remainder if remainder else "unknown")


def fetch_ncbi_metadata(accession_number):
    """
    Fetch sample metadata for an accession from NCBI GenBank via Entrez.

    Only the first "source" feature of the record is inspected. Location is
    taken, in priority order, from geo_loc_name, an "origin_locality:" note,
    locality, collection_location, isolation_source and environmental_sample;
    the country qualifier is used as a last resort for the country only.
    Ethnicity and the ancient/modern flag are derived from the same qualifiers.

    Args:
        accession_number (str): The NCBI accession number (e.g., "ON792208").

    Returns:
        dict: Keys 'country', 'specific_location', 'ethnicity', 'sample_type'
        (all lower-cased), 'collection_date', 'isolate', 'title', 'doi',
        'pubmed_id' and 'all_features' (the "name: value" lines of the source
        feature). Every field is "unknown" (pubmed_id is None) when the record
        cannot be fetched or parsed; errors are printed, never raised.
    """
    # Required by NCBI. Only install the placeholder when nothing is configured,
    # so a real address set elsewhere is not overwritten. REPLACE WITH YOUR EMAIL.
    if not getattr(Entrez, "email", None):
        Entrez.email = "your.email@example.com"

    country = "unknown"
    specific_location = "unknown"
    ethnicity = "unknown"
    sample_type = "unknown"
    isolate = "unknown"
    title = "unknown"
    doi = "unknown"
    pubmed_id = None
    features_context = ""

    try:
        handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the connection even when parsing fails.
            handle.close()

        gb_seq = None
        # Validate record structure: it should be a list with at least one dict element.
        if isinstance(record, list) and len(record) > 0:
            if isinstance(record[0], dict):
                gb_seq = record[0]
            else:
                print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
        else:
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")

        if gb_seq is None:
            return _unknown_ncbi_metadata()

        # The record creation date is the default; a collection_date qualifier overrides it.
        collection_date = gb_seq.get("GBSeq_create-date", "unknown")

        for ref in gb_seq.get("GBSeq_references", []):
            if not pubmed_id:
                pubmed_id = ref.get("GBReference_pubmed", None)
            if title == "unknown":
                title = ref.get("GBReference_title", "unknown")
            for xref in ref.get("GBReference_xref", []):
                if xref.get("GBXref_dbname") == "doi":
                    doi = xref.get("GBXref_id")
                    break

        context_for_flagging = ""  # Accumulated text for ancient/modern detection
        for feature in gb_seq.get("GBSeq_feature-table", []):
            if feature.get("GBFeature_key") != "source":
                continue

            feature_context = ""
            note_origin_locality = "unknown"
            quals = {}  # last value seen for every other qualifier name

            for qual in feature.get("GBFeature_quals", []):
                # Some qualifiers (e.g. environmental_sample) carry no value;
                # treat a missing name/value as empty instead of crashing.
                qual_name = qual.get("GBQualifier_name") or ""
                qual_value = qual.get("GBQualifier_value") or ""
                feature_context += qual_name + ": " + qual_value + "\n"
                if not qual_value:
                    continue
                if qual_name == "collection_date":
                    collection_date = qual_value
                elif qual_name == "isolate":
                    isolate = qual_value
                elif qual_name == "note":
                    if qual_value.startswith("origin_locality:"):
                        note_origin_locality = qual_value
                    context_for_flagging += qual_value + " "  # Capture all notes for flagging
                else:
                    quals[_QUALIFIER_ALIASES.get(qual_name, qual_name)] = qual_value

            def qual_or_unknown(name):
                return quals.get(name, "unknown")

            geo_loc_name = qual_or_unknown("geo_loc_name")

            # --- Aggregate all relevant info into context_for_flagging ---
            context_for_flagging += (
                f" {isolate} {qual_or_unknown('isolation_source')} {qual_or_unknown('specimen')}"
                f" {qual_or_unknown('strain')} {qual_or_unknown('organism')} {geo_loc_name}"
                f" {qual_or_unknown('collection_location')} {qual_or_unknown('environmental_sample')}"
            )
            context_for_flagging = context_for_flagging.strip()

            # --- Determine final country and specific_location based on priority ---
            found_country = "unknown"
            found_specific_location = "unknown"
            if geo_loc_name != "unknown":
                # geo_loc_name is written "Country: locality".
                parts = [p.strip() for p in geo_loc_name.split(':')]
                if len(parts) > 1:
                    found_country, found_specific_location = parts[0], parts[-1]
                else:
                    found_country = geo_loc_name
            elif note_origin_locality != "unknown":
                match = re.search(r"origin_locality:\s*(.*)", note_origin_locality, re.IGNORECASE)
                if match:
                    # The note is read as "locality: Country" (reverse of geo_loc_name).
                    location_string = match.group(1).strip()
                    parts = [p.strip() for p in location_string.split(':')]
                    if len(parts) > 1:
                        found_country, found_specific_location = parts[-1], parts[0]
                    else:
                        found_country = location_string
            else:
                for name in ("locality", "collection_location", "isolation_source", "environmental_sample"):
                    value = qual_or_unknown(name)
                    if value != "unknown":
                        found_country, found_specific_location = _split_country_and_locality(value)
                        break
            country_qual = qual_or_unknown("country")
            if found_country == "unknown" and country_qual != "unknown":
                country_match = _COUNTRY_PATTERN.search(country_qual)
                if country_match:
                    found_country = country_match.group(1)

            country = found_country
            specific_location = found_specific_location

            # --- Determine final ethnicity ---
            found_ethnicity = "unknown"
            population = qual_or_unknown("population")
            if population != "unknown":
                found_ethnicity = population
            elif isolate != "unknown" and re.fullmatch(r'[A-Za-z\s\-]+', isolate) and get_country_from_text(isolate) == "unknown":
                # A purely alphabetic isolate that is not a place name is taken as a population name.
                found_ethnicity = isolate
            elif context_for_flagging != "unknown":  # Use the broader context for ethnicity patterns
                eth_match = re.search(r'(?:population|ethnicity|isolate source):\s*([A-Za-z\s\-]+)', context_for_flagging, re.IGNORECASE)
                if eth_match:
                    found_ethnicity = eth_match.group(1).strip()
            ethnicity = found_ethnicity

            # --- Determine sample_type (ancient/modern) ---
            if context_for_flagging:
                sample_type, _explain = detect_ancient_flag(context_for_flagging)
            features_context += feature_context + "\n"
            break  # only the first source feature is used

        if specific_location != "unknown" and specific_location.lower() == country.lower():
            specific_location = "unknown"
        if not features_context:
            features_context = "unknown"
        return {"country": country.lower(),
                "specific_location": specific_location.lower(),
                "ethnicity": ethnicity.lower(),
                "sample_type": sample_type.lower(),
                "collection_date": collection_date,
                "isolate": isolate,
                "title": title,
                "doi": doi,
                "pubmed_id": pubmed_id,
                "all_features": features_context}

    except Exception as e:
        # Best effort: report the cause and fall back to the all-unknown result.
        print(f"Error fetching NCBI data for {accession_number}: {e}")
        return _unknown_ncbi_metadata()
261
+
262
+ # --- Helper function for country matching (re-defined from main code to be self-contained) ---
263
# Lower-case keyword -> country. Region-only keywords map to "unknown".
# The misspelled keys "borneu" and "south sibera" are kept for backward
# compatibility; the correctly spelled forms are listed next to them.
# NOTE(review): "mongolia" maps to "China" — confirm this is intended
# (e.g. for Inner Mongolia) and not a mistake for the country Mongolia.
_country_keywords = {
    "thailand": "Thailand", "laos": "Laos", "cambodia": "Cambodia", "myanmar": "Myanmar",
    "philippines": "Philippines", "indonesia": "Indonesia", "malaysia": "Malaysia",
    "china": "China", "chinese": "China", "india": "India", "taiwan": "Taiwan",
    "vietnam": "Vietnam", "russia": "Russia", "siberia": "Russia", "nepal": "Nepal",
    "japan": "Japan", "sumatra": "Indonesia", "borneu": "Indonesia", "borneo": "Indonesia",
    "yunnan": "China", "tibet": "China", "northern mindanao": "Philippines",
    "west malaysia": "Malaysia", "north thailand": "Thailand", "central thailand": "Thailand",
    "northeast thailand": "Thailand", "east myanmar": "Myanmar", "west thailand": "Thailand",
    "central india": "India", "east india": "India", "northeast india": "India",
    "south sibera": "Russia", "south siberia": "Russia", "mongolia": "China", "beijing": "China", "south korea": "South Korea",
    "north asia": "unknown", "southeast asia": "unknown", "east asia": "unknown"
}


def get_country_from_text(text):
    """
    Return the country implied by the first known keyword found in *text*.

    Matching is a case-insensitive substring test against _country_keywords,
    checked in the dictionary's insertion order.

    Args:
        text (str | None): Free text such as an isolate name.

    Returns:
        str: The mapped country, or "unknown" when the text is empty/None or
        contains no known keyword.
    """
    if not text:
        return "unknown"
    text_lower = text.lower()
    for keyword, country in _country_keywords.items():
        if keyword in text_lower:
            return country
    return "unknown"
283
+ # The result will be seen as manualLink for the function get_paper_text
284
+ # def search_google_custom(query, max_results=3):
285
+ # # query should be the title from ncbi or paper/source title
286
+ # GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY"]
287
+ # GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"]
288
+ # endpoint = os.environ["SEARCH_ENDPOINT"]
289
+ # params = {
290
+ # "key": GOOGLE_CSE_API_KEY,
291
+ # "cx": GOOGLE_CSE_CX,
292
+ # "q": query,
293
+ # "num": max_results
294
+ # }
295
+ # try:
296
+ # response = requests.get(endpoint, params=params)
297
+ # if response.status_code == 429:
298
+ # print("Rate limit hit. Try again later.")
299
+ # return []
300
+ # response.raise_for_status()
301
+ # data = response.json().get("items", [])
302
+ # return [item.get("link") for item in data if item.get("link")]
303
+ # except Exception as e:
304
+ # print("Google CSE error:", e)
305
+ # return []
306
+
307
def search_google_custom(query, max_results=3):
    """
    Search the Google Custom Search endpoint and return the result links.

    Credentials and endpoint are read from the environment variables
    GOOGLE_CSE_API_KEY, GOOGLE_CSE_CX and SEARCH_ENDPOINT (a missing variable
    raises KeyError). On HTTP 429 the backup account is tried via
    search_google_custom_backup.

    Args:
        query (str): Search text, normally the NCBI/paper title.
        max_results (int): Number of results requested ("num" parameter).

    Returns:
        list[str]: Links of the returned items; an empty list on rate limiting
        without a working backup, or on any request/parsing error.
    """
    GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY"]
    GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX"]
    endpoint = os.environ["SEARCH_ENDPOINT"]
    params = {
        "key": GOOGLE_CSE_API_KEY,
        "cx": GOOGLE_CSE_CX,
        "q": query,
        "num": max_results
    }
    try:
        # A timeout keeps a stalled connection from hanging the pipeline forever.
        response = requests.get(endpoint, params=params, timeout=30)
        if response.status_code == 429:
            print("Rate limit hit. Try again later.")
            print("try with back up account")
            try:
                return search_google_custom_backup(query, max_results)
            except Exception:
                # e.g. the backup credentials are not configured
                return []
        response.raise_for_status()
        data = response.json().get("items", [])
        return [item.get("link") for item in data if item.get("link")]
    except Exception as e:
        print("Google CSE error:", e)
        return []
333
+
334
def search_google_custom_backup(query, max_results=3):
    """
    Search the Google Custom Search endpoint using the backup credentials.

    Reads GOOGLE_CSE_API_KEY_BACKUP, GOOGLE_CSE_CX_BACKUP and SEARCH_ENDPOINT
    from the environment (a missing variable raises KeyError).

    Args:
        query (str): Search text, normally the NCBI/paper title.
        max_results (int): Number of results requested ("num" parameter).

    Returns:
        list[str]: Links of the returned items; an empty list when rate
        limited (HTTP 429) or on any request/parsing error.
    """
    GOOGLE_CSE_API_KEY = os.environ["GOOGLE_CSE_API_KEY_BACKUP"]
    GOOGLE_CSE_CX = os.environ["GOOGLE_CSE_CX_BACKUP"]
    endpoint = os.environ["SEARCH_ENDPOINT"]
    params = {
        "key": GOOGLE_CSE_API_KEY,
        "cx": GOOGLE_CSE_CX,
        "q": query,
        "num": max_results
    }
    try:
        # A timeout keeps a stalled connection from hanging the pipeline forever.
        response = requests.get(endpoint, params=params, timeout=30)
        if response.status_code == 429:
            print("Rate limit hit. Try again later.")
            return []
        response.raise_for_status()
        data = response.json().get("items", [])
        return [item.get("link") for item in data if item.get("link")]
    except Exception as e:
        print("Google CSE error:", e)
        return []
356
+ # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing
357
+ # Step 3.1: Extract Text
358
+ # sub: download excel file
359
def download_excel_file(url, save_path="temp.xlsx"):
    """
    Download an Excel workbook to *save_path* and return the local path.

    Two kinds of URL are downloaded: Office online-viewer links (the real file
    URL is taken from their "src" query parameter) and http(s) links ending in
    .xls/.xlsx (case-insensitive). Anything else is assumed to be unsupported
    or already local and is returned unchanged.

    Args:
        url (str): Remote URL or local path of the workbook.
        save_path (str): Where to write the downloaded bytes.

    Returns:
        str: *save_path* after a download, otherwise *url* as given.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if "view.officeapps.live.com" in url:
        parsed_url = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        real_url = urllib.parse.unquote(parsed_url["src"][0])
        response = requests.get(real_url, timeout=60)
        # Without this an HTML error page would be saved as if it were a workbook.
        response.raise_for_status()
        with open(save_path, "wb") as f:
            f.write(response.content)
        return save_path
    elif url.startswith("http") and url.lower().endswith((".xls", ".xlsx")):
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Raises error if download fails
        with open(save_path, "wb") as f:
            f.write(response.content)
        return save_path
    else:
        print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
        return url
376
def get_paper_text(doi, id, manualLinks=None):
    """
    Collect the text of a paper and of its supplementary files.

    The DOI landing page is parsed for the main text and for links to
    supplementary material; PDF, Word and Excel links (plus any *manualLinks*)
    are downloaded into data/<id> and converted to text.

    Args:
        doi (str): DOI of the paper, without the https://doi.org/ prefix.
        id: Identifier (e.g. PubMed ID) used as the working sub-folder name.
        manualLinks (list[str] | None): Extra links to process.

    Returns:
        dict: Maps each processed link (or local path) to its extracted text,
        e.g. {"doiLink": "paperText", "file1.pdf": "text1", "file3.xlsx": "excelText3"}.
    """
    # NOTE(review): extractHTML, pdf, wordDoc and word2vec are used below, but
    # their imports are commented out at the top of this module — confirm they
    # are in scope before calling this function.
    sample_dir = str(id)  # the id may not be a str; normalise once
    folder_path = Path("data") / sample_dir
    if not folder_path.exists():
        # Create the folder (and data/ itself if needed) without going through a shell.
        folder_path.mkdir(parents=True, exist_ok=True)
        print("data/"+str(id) +" created.")
    else:
        print("data/"+str(id) +" already exists.")
    saveLinkFolder = "data/" + sample_dir

    link = 'https://doi.org/' + doi
    textsToExtract = {}
    # Parse the landing page and gather the supplementary-material links.
    html = extractHTML.HTML("", link)
    jsonSM = html.getSupMaterial()
    text = ""
    links = [link] + [url for key in jsonSM for url in jsonSM[key]]
    if manualLinks is not None:
        links += manualLinks
    for l in links:
        name = l.split("/")[-1]
        file_path = folder_path / name
        extension = l.split(".")[-1].lower()
        if l == link:
            # the main paper
            text = html.getListSection()
            textsToExtract[link] = text
        elif l.endswith(".pdf"):
            if file_path.is_file():
                # reuse the copy downloaded by an earlier run
                l = saveLinkFolder + "/" + name
                print("File exists.")
            p = pdf.PDF(l, saveLinkFolder, doi)
            # presumably fetches the file into saveLinkFolder — TODO confirm
            p.openPDFFile()
            pdf_path = saveLinkFolder + "/" + l.split("/")[-1]
            doc = fitz.open(pdf_path)
            try:
                text = "\n".join([page.get_text() for page in doc])
            finally:
                doc.close()
            textsToExtract[l] = text
        elif l.endswith(".doc") or l.endswith(".docx"):
            d = wordDoc.wordDoc(l, saveLinkFolder)
            text = d.extractTextByPage()
            textsToExtract[l] = text
        elif extension in ("xls", "xlsx"):
            # Exact extension match; the former substring test also accepted
            # extensions such as "x", "s" or an empty one.
            wc = word2vec.word2Vec()
            # download the excel file if it is not downloaded yet
            savePath = saveLinkFolder + "/" + l.split("/")[-1]
            excelPath = download_excel_file(l, savePath)
            corpus = wc.tableTransformToCorpusText([], excelPath)
            text = ''
            for c in corpus:
                para = corpus[c]
                for words in para:
                    text += " ".join(words)
            textsToExtract[l] = text
    # The data/<id> folder is intentionally kept so later runs can reuse downloads.
    return textsToExtract
437
+ # Step 3.2: Extract context
438
def extract_context(text, keyword, window=500):
    """
    Return the text surrounding the first occurrence of *keyword*.

    The slice starts up to *window* characters before the match and ends
    *window* characters after the start of the match. The search is
    case-sensitive. When the keyword is absent the literal string
    "Sample ID not found." is returned.
    """
    position = text.find(keyword)
    if position != -1:
        start = max(0, position - window)
        end = position + window
        return text[start:end]
    return "Sample ID not found."
444
def extract_relevant_paragraphs(text, accession, keep_if=None, isolate=None):
    """
    Reduce *text* to the snippets most likely to describe the sample.

    The text is lower-cased, then a 700-character window around the accession
    and around the isolate name (when given and present) is collected,
    followed by a 500-character window around the first occurrence of each
    keyword in *keep_if* that is not already contained in the output.
    Keywords that do not occur contribute nothing.

    Args:
        text (str): Full document text.
        accession (str | None): Accession number to look for.
        keep_if (list[str] | None): Context keywords; a default list is used
            when None.
        isolate (str | None): Isolate name to look for.

    Returns:
        str: The concatenated lower-case snippets ("" when nothing matches).
    """
    if keep_if is None:
        keep_if = ["sample", "method", "mtdna", "sequence", "collected", "dataset", "supplementary", "table"]
    if not text:
        return ""

    text = text.lower()

    def window_around(keyword, window):
        # None (not a sentinel string) marks "not found", so nothing leaks into the output.
        idx = text.find(keyword)
        if idx == -1:
            return None
        return text[max(0, idx - window): idx + window]

    outputs = ""
    # Sample identifiers first: these are the most specific anchors.
    for sample_id in (accession, isolate):
        if sample_id:
            snippet = window_around(sample_id.lower(), 700)
            if snippet is not None:
                outputs += snippet
    for keyword in keep_if:
        para = window_around(keyword.lower(), 500)
        if para and para not in outputs:
            outputs += para + "\n"
    return outputs
464
+ # Step 4: Classification for now (demo purposes)
465
+ # 4.1: Using a HuggingFace model (question-answering)
466
# Lazily created question-answering pipeline, shared by all calls in this process.
_QA_PIPELINE = None


def _get_qa_pipeline():
    """Build the question-answering pipeline on first use and reuse it afterwards."""
    global _QA_PIPELINE
    if _QA_PIPELINE is None:
        _QA_PIPELINE = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    return _QA_PIPELINE


def infer_fromQAModel(context, question="Where is the mtDNA sample from?"):
    """
    Answer *question* about *context* with an extractive QA model.

    Args:
        context (str): Text to search for the answer.
        question (str): Question to ask.

    Returns:
        str: The model's answer ("Unknown" if the result has no "answer"
        key), or "Error: <message>" when the model cannot be loaded or run.
    """
    try:
        # Loading the model is expensive, so it is done once rather than on every call.
        qa = _get_qa_pipeline()
        result = qa({"context": context, "question": question})
        return result.get("answer", "Unknown")
    except Exception as e:
        return f"Error: {str(e)}"
473
+
474
+ # 4.2: Infer from haplogroup
475
+ # Load pre-trained spaCy model for NER
476
+ try:
477
+ nlp = spacy.load("en_core_web_sm")
478
+ except OSError:
479
+ download("en_core_web_sm")
480
+ nlp = spacy.load("en_core_web_sm")
481
+
482
+ # Define the haplogroup-to-region mapping (simple rule-based)
483
+ import csv
484
+
485
def load_haplogroup_mapping(csv_path):
    """
    Load the haplogroup lookup table from a CSV file.

    The file must have the columns "haplogroup", "region" and "source".

    Args:
        csv_path (str): Path of the CSV file.

    Returns:
        dict: Maps each haplogroup to a two-item list [region, source]; a
        haplogroup listed more than once keeps its last row.
    """
    # newline="" is what the csv module expects; utf-8-sig also accepts a
    # leading byte-order mark, which would otherwise corrupt the first header.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        return {row["haplogroup"]: [row["region"], row["source"]] for row in csv.DictReader(f)}
492
+
493
+ # Function to extract haplogroup from the text
494
def extract_haplogroup(text):
    """
    Extract an mtDNA haplogroup label from *text*.

    A label introduced by the word "haplogroup" (any capitalisation) is
    preferred and is reduced to its leading letter plus digits (e.g. "B4a1a"
    becomes "B4"). Otherwise the first stand-alone token that looks like a
    haplogroup — a capital letter, a digit, then up to four digits or
    lower-case letters — is returned unshortened.

    Args:
        text (str | None): Text to search.

    Returns:
        str | None: The haplogroup label, or None when none is found.
    """
    if not text:
        return None
    # Only the keyword is case-insensitive; the label must start with a capital.
    match = re.search(r'\b(?i:haplogroup)\s+([A-Z][0-9a-z]*)\b', text)
    if match:
        # Always matches, because the captured label starts with a capital letter.
        return re.match(r'[A-Z][0-9]*', match.group(1)).group(0)
    # A digit is required right after the letter so that ordinary capitalised
    # words ("The", "Laos") are not mistaken for haplogroups.
    fallback = re.search(r'\b([A-Z][0-9][0-9a-z]{0,4})\b', text)
    if fallback:
        return fallback.group(1)
    return None
506
+
507
+
508
+ # Function to extract location based on NER
509
def extract_location(text):
    """Return the text of every entity labelled GPE (geopolitical entity) in *text*, in document order."""
    parsed = nlp(text)
    return [entity.text for entity in parsed.ents if entity.label_ == "GPE"]
516
+
517
+ # Function to infer location from haplogroup
518
def infer_location_from_haplogroup(haplogroup, csv_path="data/haplogroup_regions_extended.csv"):
    """
    Look up the region associated with a haplogroup.

    Args:
        haplogroup (str | None): Haplogroup label, e.g. "B4".
        csv_path (str): Lookup table to read; defaults to the bundled
            haplogroup/region CSV.

    Returns:
        list: [region, source] for the haplogroup, or ["Unknown", "Unknown"]
        when it is not in the table.
    """
    haplo_map = load_haplogroup_mapping(csv_path)
    return haplo_map.get(haplogroup, ["Unknown", "Unknown"])
521
+
522
+ # Function to classify the mtDNA sample
523
def classify_mtDNA_sample_from_haplo(text):
    """
    Infer a sample's origin from the haplogroup mentioned in *text*.

    Args:
        text (str): Context snippet around the sample.

    Returns:
        dict: "haplogroup" (label found in the text, or None),
        "inferred_location" and "source" (from the haplogroup lookup table),
        and "locations_found_in_context" (place names found by NER).
    """
    # Extract haplogroup
    haplogroup = extract_haplogroup(text)
    # Extract location based on NER
    locations = extract_location(text)
    # Infer location based on haplogroup; look it up once, since each lookup re-reads the CSV.
    region_and_source = infer_location_from_haplogroup(haplogroup)
    inferred_location, sourceHaplo = region_and_source[0], region_and_source[1]
    return {
        "source": sourceHaplo,
        "locations_found_in_context": locations,
        "haplogroup": haplogroup,
        "inferred_location": inferred_location
    }
537
+ # 4.3 Get from available NCBI
538
def infer_location_fromNCBI(accession):
    """
    Read the sample location straight from the NCBI record of *accession*.

    The fetched text is searched for the first /geo_loc_name, /country or
    /location qualifier.

    Args:
        accession (str): NCBI accession number.

    Returns:
        tuple[str, str]: (qualifier value, full matched qualifier text), or
        ("Not found", "Not found") when there is no such qualifier or the
        request fails.
    """
    # NOTE(review): the regex expects GenBank-style '/name="value"' qualifiers
    # while the request asks for rettype="medline" — confirm the returned format.
    try:
        handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
        try:
            text = handle.read()
        finally:
            # Close the connection even when reading fails.
            handle.close()
        match = re.search(r'/(geo_loc_name|country|location)\s*=\s*"([^"]+)"', text)
        if match:
            return match.group(2), match.group(0)  # This is the value like "Brunei"
        return "Not found", "Not found"

    except Exception as e:
        print("❌ Entrez error:", e)
        return "Not found", "Not found"
551
+
552
+ ### ANCIENT/MODERN FLAG
553
+ from Bio import Entrez
554
+ import re
555
+
556
def flag_ancient_modern(accession, textsToExtract, isolate=None):
    """
    Classify a sample as Ancient or Modern.

    1. The NCBI record is checked first: its isolation_source and
       specimen/specimen_voucher qualifiers are passed to detect_ancient_flag.
    2. If that gives no verdict (or NCBI cannot be reached), every text in
       *textsToExtract* is reduced to its relevant paragraphs and classified;
       the majority label wins and a tie goes to "Ancient".

    Args:
        accession (str): NCBI accession number.
        textsToExtract (dict): Maps a source (link/file) to its text.
        isolate (str | None): Isolate name, used to locate relevant paragraphs.

    Returns:
        tuple[str, str]: (label, explanation). The label is "Ancient",
        "Modern" or "Unknown"; ("", "") is returned when there are no texts
        to inspect or the text analysis fails.
    """
    # --- 1. NCBI metadata. A failure here must not prevent the text fallback. ---
    try:
        handle = Entrez.efetch(db="nuccore", id=accession, rettype="medline", retmode="text")
        try:
            text = handle.read()
        finally:
            handle.close()

        context = ""
        for pattern in (r'/(isolation_source)\s*=\s*"([^"]+)"',
                        r'/(specimen|specimen_voucher)\s*=\s*"([^"]+)"'):
            found = re.search(pattern, text)
            if found:
                context += found.group(0) + " "

        if context.strip():
            label, explain = detect_ancient_flag(context)
            if label != "Unknown":
                return label, explain + " from NCBI\n(" + context + ")"
    except Exception as e:
        print("Error:", e)

    # --- 2. Supplementary texts ---
    if not textsToExtract:
        print("No DOI or PubMed ID available for inference.")
        return "", ""

    try:
        votes = {"Modern": 0, "Ancient": 0}
        explanations = {"Modern": "", "Ancient": ""}
        for source in textsToExtract:
            text_block = textsToExtract[source]
            # Reduce to informative paragraph(s)
            context = extract_relevant_paragraphs(text_block, accession, isolate=isolate)
            label, explain = detect_ancient_flag(context)
            if label in votes:
                votes[label] += 1
                explanations[label] += f"{source}:\n{explain}\n\n"

        if max(votes["Modern"], votes["Ancient"]) > 0:
            if votes["Modern"] > votes["Ancient"]:
                return "Modern", explanations["Modern"]
            return "Ancient", explanations["Ancient"]
        return "Unknown", "No strong keywords detected"
    except Exception as e:
        print("Error:", e)
        return "", ""
616
+
617
+
618
def _keyword_hits(keywords, context):
    """Return the keywords that occur in *context* at the start of a word."""
    # The leading word boundary still matches plurals ("site" -> "sites") but
    # no longer matches inside other words ("website", "unhealthy").
    return [k for k in keywords if re.search(r'\b' + re.escape(k), context)]


def detect_ancient_flag(context_snippet):
    """
    Decide whether a text snippet describes an ancient or a modern sample.

    Keyword hits decide first: only ancient hits -> "Ancient", only modern
    hits -> "Modern", both -> whichever list has more hits (a tie goes to
    "Ancient"). With no hits at all the QA model is asked instead.

    Args:
        context_snippet (str | None): Text describing the sample.

    Returns:
        tuple[str, str]: (label, explanation); the label is "Ancient",
        "Modern" or "Unknown".
    """
    context = (context_snippet or "").lower()

    ancient_keywords = [
        "ancient", "archaeological", "prehistoric", "neolithic", "mesolithic", "paleolithic",
        "bronze age", "iron age", "burial", "tomb", "skeleton", "14c", "radiocarbon", "carbon dating",
        "postmortem damage", "udg treatment", "adna", "degradation", "site", "excavation",
        "archaeological context", "temporal transect", "population replacement", "cal bp", "calbp", "carbon dated"
    ]

    modern_keywords = [
        "modern", "hospital", "clinical", "consent","blood","buccal","unrelated", "blood sample","buccal sample","informed consent", "donor", "healthy", "patient",
        "genotyping", "screening", "medical", "cohort", "sequencing facility", "ethics approval",
        "we analysed", "we analyzed", "dataset includes", "new sequences", "published data",
        "control cohort", "sink population", "genbank accession", "sequenced", "pipeline",
        "bioinformatic analysis", "samples from", "population genetics", "genome-wide data", "imr collection"
    ]

    ancient_hits = _keyword_hits(ancient_keywords, context)
    modern_hits = _keyword_hits(modern_keywords, context)

    if ancient_hits and not modern_hits:
        return "Ancient", f"Flagged as ancient due to keywords: {', '.join(ancient_hits)}"
    elif modern_hits and not ancient_hits:
        return "Modern", f"Flagged as modern due to keywords: {', '.join(modern_hits)}"
    elif ancient_hits and modern_hits:
        if len(ancient_hits) >= len(modern_hits):
            return "Ancient", f"Mixed context, leaning ancient due to: {', '.join(ancient_hits)}"
        else:
            return "Modern", f"Mixed context, leaning modern due to: {', '.join(modern_hits)}"

    # Fallback to QA
    answer = infer_fromQAModel(context, question="Are the mtDNA samples ancient or modern? Explain why.")
    if answer.startswith("Error"):
        return "Unknown", answer
    if "ancient" in answer.lower():
        return "Ancient", f"Leaning ancient based on QA: {answer}"
    elif "modern" in answer.lower():
        return "Modern", f"Leaning modern based on QA: {answer}"
    else:
        return "Unknown", f"No strong keywords or QA clues. QA said: {answer}"
659
+
660
+ # STEP 5: Main pipeline: accession -> 1. get pubmed id and isolate -> 2. get doi -> 3. get text -> 4. prediction -> 5. output: inferred location + explanation + confidence score
661
def classify_sample_location(accession):
    """
    Run the full pipeline for one accession.

    Steps: look up the PubMed ID and isolate name, resolve the DOI, fetch the
    paper and supplementary texts, flag the sample as ancient/modern, then
    predict its location from NCBI, from the QA model and the haplogroup (per
    source text, for both the accession and the isolate), and from the
    upgrade classifier over all texts combined.

    Returns:
        tuple: (outputs, label, explain) where outputs maps the accession and
        the isolate name to their per-source predictions, and label/explain
        are the ancient/modern verdict and its explanation.
    """
    outputs = {}

    # Step 1: get pubmed id and isolate
    pubmedID, isolate = get_info_from_accession(accession)
    if not isolate:
        isolate = "UNKNOWN_ISOLATE"

    # Step 2: get doi
    doi = get_doi_from_pubmed_id(pubmedID)

    # Step 3: get text, e.g. {"doiLink": "paperText", "file1.pdf": "text1"}
    textsToExtract = get_paper_text(doi, pubmedID) if doi and pubmedID else {}

    if isolate not in [None, "UNKNOWN_ISOLATE"]:
        label, explain = flag_ancient_modern(accession, textsToExtract, isolate)
    else:
        label, explain = flag_ancient_modern(accession, textsToExtract)

    # Step 4: prediction
    outputs[accession] = {}
    outputs[isolate] = {}

    # 4.0 Infer from NCBI
    location, outputNCBI = infer_location_fromNCBI(accession)
    outputs[accession]["NCBI"] = {
        "NCBI": {
            "source": "NCBI",
            "sample_id": accession,
            "predicted_location": location,
            "context_snippet": outputNCBI,
        }
    }

    if not textsToExtract:
        return outputs, label, explain

    # The accession is tried first, then the isolate name.
    sample_ids = (accession, isolate)
    long_text = ""
    for source, text in textsToExtract.items():
        for keyword in sample_ids:
            outputs[keyword][source] = {}
            context = extract_context(text, keyword, window=500)
            # 4.1: Using a HuggingFace model (question-answering)
            answer = infer_fromQAModel(context, question=f"Where is the mtDNA sample {keyword} from?")
            outputs[keyword][source]["QAModel"] = {
                "source": source,
                "sample_id": keyword,
                "predicted_location": answer,
                "context_snippet": context,
            }
            # 4.2: Infer from haplogroup
            outputs[keyword][source]["haplogroup"] = classify_mtDNA_sample_from_haplo(context)
        # add long text
        long_text += text + ". \n"

    # 4.3: UpgradeClassify over all sources combined
    for sample_id in sample_ids:
        if not sample_id:
            continue
        filtered_context = filter_context_for_sample(sample_id.upper(), long_text, window_size=1)
        locations = infer_location_for_sample(sample_id.upper(), filtered_context)
        if locations != "No clear location found in top matches":
            outputs[sample_id]["upgradeClassifier"] = {
                "upgradeClassifier": {
                    "source": "From these sources combined: " + ", ".join(list(textsToExtract.keys())),
                    "sample_id": sample_id,
                    "predicted_location": ", ".join(locations),
                    "context_snippep": "First 1000 words: \n" + filtered_context[:1000],
                }
            }
    return outputs, label, explain
core/pipeline.py ADDED
@@ -0,0 +1,793 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # test1: MJ17 direct
2
+ # test2: "A1YU101" thailand cross-ref
3
+ # test3: "EBK109" thailand cross-ref
4
+ # test4: "OQ731952"/"BST115" for search query title: "South Asian maternal and paternal lineages in southern Thailand and"
5
+ import os, io, time, re, json
6
+ import pandas as pd
7
+ import subprocess
8
+ import multiprocessing
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Optional
11
+
12
+
13
+ import google.generativeai as genai
14
+
15
+
16
+ # Google Drive (optional)
17
+ from google.oauth2.service_account import Credentials
18
+ from googleapiclient.discovery import build
19
+ from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
20
+
21
+ import gspread
22
+ from oauth2client.service_account import ServiceAccountCredentials
23
+
24
+ # ---- core modules (must exist in your project) ----
25
+ import core.mtdna_classifier as mtdna_classifier
26
+ import core.data_preprocess as data_preprocess
27
+ import core.model as model
28
+ import core.smart_fallback as smart_fallback
29
+ import core.standardize_location as standardize_location
30
+ from core.NER.html import extractHTML
31
+ from core.drive_utils import *
32
+
33
+ # def run_with_timeout(func, args=(), kwargs={}, timeout=20):
34
+ # """
35
+ # Runs `func` with timeout in seconds. Kills if it exceeds.
36
+ # Returns: (success, result or None)
37
+ # """
38
+ # def wrapper(q, *args, **kwargs):
39
+ # try:
40
+ # q.put(func(*args, **kwargs))
41
+ # except Exception as e:
42
+ # q.put(e)
43
+
44
+ # q = multiprocessing.Queue()
45
+ # p = multiprocessing.Process(target=wrapper, args=(q, *args), kwargs=kwargs)
46
+ # p.start()
47
+ # p.join(timeout)
48
+
49
+ # if p.is_alive():
50
+ # p.terminate()
51
+ # p.join()
52
+ # print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
53
+ # return False, None
54
+ # else:
55
+ # result = q.get()
56
+ # if isinstance(result, Exception):
57
+ # raise result
58
+ # return True, result
59
+ # def run_with_timeout(func, args=(), kwargs={}, timeout=30):
60
+ # import concurrent.futures
61
+ # with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
62
+ # future = executor.submit(func, *args, **kwargs)
63
+ # try:
64
+ # return True, future.result(timeout=timeout)
65
+ # except concurrent.futures.TimeoutError:
66
+ # print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
67
+ # return False, None
68
+
69
def _timeout_worker(q, func, args, kwargs):
    """Child-process entry point: run ``func`` and report the outcome on ``q``.

    Puts ``(True, result)`` on success or ``(False, exception)`` on failure.
    Defined at module level (not nested) so that it can be pickled when the
    multiprocessing start method is "spawn" or "forkserver".
    """
    try:
        q.put((True, func(*args, **kwargs)))
    except Exception as e:
        q.put((False, e))


def run_with_timeout(func, args=(), kwargs=None, timeout=30):
    """Run ``func(*args, **kwargs)`` in a separate process with a time limit.

    Args:
        func: Callable to execute in the child process.
        args: Positional arguments for ``func``.
        kwargs: Keyword arguments for ``func`` (``None`` means no kwargs).
        timeout: Maximum number of seconds to wait; ``None`` waits forever.

    Returns:
        ``(True, result)`` when ``func`` finished in time, or ``(False, None)``
        when the time limit was hit (the child is terminated) or the child
        exited without reporting a result.

    Raises:
        Exception: whatever ``func`` raised in the child is re-raised here.
    """
    from queue import Empty  # local import: only needed by this helper

    # Avoid a shared mutable default argument.
    kwargs = {} if kwargs is None else kwargs

    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=_timeout_worker, args=(q, func, args, kwargs))
    p.start()

    # Read the result BEFORE joining. A child that has put a large object on
    # the queue cannot exit until the parent drains it, so joining first would
    # stall for the whole timeout and then be misreported as a timeout.
    deadline = None if timeout is None else time.monotonic() + timeout
    outcome = None
    while outcome is None:
        if deadline is None:
            wait = 0.05
        else:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            wait = min(remaining, 0.05)
        # Sample liveness before the read: if the child was already gone and
        # the queue is still empty afterwards, no result will ever arrive.
        child_done = not p.is_alive()
        try:
            outcome = q.get(timeout=wait)
        except Empty:
            if child_done:
                break

    if outcome is None:
        timed_out = p.is_alive()
        if timed_out:
            p.terminate()
        p.join()
        if timed_out:
            print(f"⏱️ Timeout exceeded ({timeout} sec) — function killed.")
        return False, None

    p.join()
    success, result = outcome
    if success:
        return True, result
    raise result  # re-raise the exception captured in the child
96
+
97
+
98
+
99
def time_it(func, *args, **kwargs):
    """
    Measure how long a function takes to run and return its result + time.

    Args:
        func: Callable to invoke.
        *args, **kwargs: Forwarded unchanged to ``func``.

    Returns:
        ``(result, elapsed)`` where ``elapsed`` is the call duration in seconds.
    """
    # perf_counter is monotonic, so the interval cannot be skewed (or go
    # negative) when the system clock is adjusted mid-call.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    # Not every callable has __name__ (e.g. functools.partial objects).
    label = getattr(func, "__name__", None) or repr(func)
    print(f"⏱️ '{label}' took {elapsed:.3f} seconds")
    return result, elapsed
109
+ # --- Pricing constants are defined inside pipeline_with_gemini() (Gemini 2.5 Flash-Lite & Embedding-001 rates) ---
110
+
111
def unique_preserve_order(seq):
    """Return the items of ``seq`` as a list with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    # dict keys are unique and keep insertion order, which gives the same
    # result as tracking a "seen" set by hand.
    return list(dict.fromkeys(seq))
114
+ # Main execution
115
+ def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save_df=None):
116
+ # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
117
+ # there can be one accession number in the accessions
118
+ # Prices are per 1,000 tokens
119
+ # Before each big step:
120
+ if stop_flag is not None and stop_flag.value:
121
+ print(f"🛑 Stop detected before starting {accession}, aborting early...")
122
+ return {}
123
+ # PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
124
+ # PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
125
+ # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
126
+ # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
127
+ PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
128
+ PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
129
+
130
+ # Embedding-001 pricing per 1,000 input tokens
131
+ PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens
132
+ if not accessions:
133
+ print("no input")
134
+ return None
135
+ else:
136
+ accs_output = {}
137
+ #genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
138
+ genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
139
+ for acc in accessions:
140
+ print("start gemini: ", acc)
141
+ start = time.time()
142
+ total_cost_title = 0
143
+ jsonSM, links, article_text = {},[], ""
144
+ acc_score = { "isolate": "",
145
+ "country":{},
146
+ "sample_type":{},
147
+ #"specific_location":{},
148
+ #"ethnicity":{},
149
+ "query_cost":total_cost_title,
150
+ "time_cost":None,
151
+ "source":links,
152
+ "file_chunk":"",
153
+ "file_all_output":""}
154
+ if niche_cases:
155
+ for niche in niche_cases:
156
+ acc_score[niche] = {}
157
+
158
+ meta = mtdna_classifier.fetch_ncbi_metadata(acc)
159
+ country, spe_loc, ethnic, sample_type, col_date, iso, title, doi, pudID, features = meta["country"], meta["specific_location"], meta["ethnicity"], meta["sample_type"], meta["collection_date"], meta["isolate"], meta["title"], meta["doi"], meta["pubmed_id"], meta["all_features"]
160
+ acc_score["isolate"] = iso
161
+ print("meta: ",meta)
162
+ meta_expand = smart_fallback.fetch_ncbi(acc)
163
+ print("meta expand: ", meta_expand)
164
+ # set up step: create the folder to save document
165
+ chunk, all_output = "",""
166
+ if pudID:
167
+ id = str(pudID)
168
+ saveTitle = title
169
+ else:
170
+ try:
171
+ author_name = meta_expand["authors"].split(',')[0] # Use last name only
172
+ except:
173
+ author_name = meta_expand["authors"]
174
+ saveTitle = title + "_" + col_date + "_" + author_name
175
+ if title.lower() == "unknown" and col_date.lower()=="unknown" and author_name.lower() == "unknown":
176
+ saveTitle += "_" + acc
177
+ id = "DirectSubmission"
178
+ # folder_path = Path("/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id))
179
+ # if not folder_path.exists():
180
+ # cmd = f'mkdir /content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/{id}'
181
+ # result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
182
+ # print("data/"+str(id) +" created.")
183
+ # else:
184
+ # print("data/"+str(id) +" already exists.")
185
+ # saveLinkFolder = "/content/drive/MyDrive/CollectData/MVP/mtDNA-Location-Classifier/data/"+str(id)
186
+ # parent_folder_id = get_or_create_drive_folder(GDRIVE_PARENT_FOLDER_NAME)
187
+ # data_folder_id = get_or_create_drive_folder(GDRIVE_DATA_FOLDER_NAME, parent_id=parent_folder_id)
188
+ # sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
189
+ data_folder_id = GDRIVE_DATA_FOLDER_NAME # Use the shared folder directly
190
+ sample_folder_id = get_or_create_drive_folder(str(id), parent_id=data_folder_id)
191
+ print("sample folder id: ", sample_folder_id)
192
+
193
+ # Define document names
194
+ if len(saveTitle) > 50:
195
+ saveName = saveTitle[:50]
196
+ saveName = saveName.replace(" ", "_")
197
+ chunk_filename = f"{saveName}_merged_document.docx"
198
+ all_filename = f"{saveName}_all_merged_document.docx"
199
+ else:
200
+ saveName = saveTitle.replace(" ", "_")
201
+ chunk_filename = f"{saveName}_merged_document.docx"
202
+ all_filename = f"{saveName}_all_merged_document.docx"
203
+ print("chunk file name and all filename: ", chunk_filename, all_filename)
204
+ # Define local temp paths for reading/writing
205
+ # import tempfile
206
+ # tmp_dir = tempfile.mkdtemp()
207
+ LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
208
+ os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
209
+ file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
210
+ file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
211
+ # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
212
+ # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
213
+ if stop_flag is not None and stop_flag.value:
214
+ print(f"🛑 Stop processing {accession}, aborting early...")
215
+ return {}
216
+ print("this is file chunk path: ", file_chunk_path)
217
+ chunk_id = find_drive_file(chunk_filename, sample_folder_id)
218
+ all_id = find_drive_file(all_filename, sample_folder_id)
219
+
220
+ if chunk_id and all_id:
221
+ print("✅ Files already exist in Google Drive. Downloading them...")
222
+ chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
223
+ all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
224
+ acc_score["file_chunk"] = str(chunk_filename)
225
+ acc_score["file_all_output"] = str(all_filename)
226
+ print("chunk_id and all_id: ")
227
+ print(chunk_id, all_id)
228
+ print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"])
229
+ file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
230
+ print("📄 Name:", file["name"])
231
+ print("📁 Parent folder ID:", file["parents"][0])
232
+ print("🔗 View link:", file["webViewLink"])
233
+
234
+
235
+ # Read and parse these into `chunk` and `all_output`
236
+ else:
237
+ # 🔥 Remove any stale local copies
238
+ if os.path.exists(file_chunk_path):
239
+ os.remove(file_chunk_path)
240
+ print(f"🗑️ Removed stale: {file_chunk_path}")
241
+ if os.path.exists(file_all_path):
242
+ os.remove(file_all_path)
243
+ print(f"🗑️ Removed stale: {file_all_path}")
244
+ # 🔥 Remove the local file first if it exists
245
+ # if os.path.exists(file_chunk_path):
246
+ # os.remove(file_chunk_path)
247
+ # print("remove chunk path")
248
+ # if os.path.exists(file_all_path):
249
+ # os.remove(file_all_path)
250
+ # print("remove all path")
251
+ # Try to download if already exists on Drive
252
+ chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
253
+ all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
254
+ print("chunk exist: ", chunk_exists)
255
+ # first way: ncbi method
256
+ print("country.lower: ",country.lower())
257
+ if country.lower() != "unknown":
258
+ stand_country = standardize_location.smart_country_lookup(country.lower())
259
+ print("stand_country: ", stand_country)
260
+ if stand_country.lower() != "not found":
261
+ acc_score["country"][stand_country.lower()] = ["ncbi"]
262
+ else: acc_score["country"][country.lower()] = ["ncbi"]
263
+ # if spe_loc.lower() != "unknown":
264
+ # acc_score["specific_location"][spe_loc.lower()] = ["ncbi"]
265
+ # if ethnic.lower() != "unknown":
266
+ # acc_score["ethnicity"][ethnic.lower()] = ["ncbi"]
267
+ if sample_type.lower() != "unknown":
268
+ acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
269
+ # second way: LLM model
270
+ # Preprocess the input token
271
+ print(acc_score)
272
+ accession, isolate = None, None
273
+ if acc != "unknown": accession = acc
274
+ if iso != "unknown": isolate = iso
275
+ if stop_flag is not None and stop_flag.value:
276
+ print(f"🛑 Stop processing {accession}, aborting early...")
277
+ return {}
278
+ # check doi first
279
+ print("chunk filename: ", chunk_filename)
280
+ if chunk_exists:
281
+ print("File chunk exists!")
282
+ if not chunk:
283
+ print("start to get chunk")
284
+ text, table, document_title = model.read_docx_text(file_chunk_path)
285
+ chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
286
+ if str(chunk_filename) != "":
287
+ print("first time have chunk path at chunk exist: ", str(chunk_filename))
288
+ acc_score["file_chunk"] = str(chunk_filename)
289
+ if all_exists:
290
+ print("File all output exists!")
291
+ if not all_output:
292
+ text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
293
+ all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
294
+ if str(all_filename) != "":
295
+ print("first time have all path at all exist: ", str(all_filename))
296
+ acc_score["file_all_output"] = str(all_filename)
297
+ print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"])
298
+ if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
299
+ if doi != "unknown":
300
+ link = 'https://doi.org/' + doi
301
+ # get the file to create listOfFile for each id
302
+ print("link of doi: ", link)
303
+ html = extractHTML.HTML("",link)
304
+ jsonSM = html.getSupMaterial()
305
+ article_text = html.getListSection()
306
+ if article_text:
307
+ if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
308
+ links.append(link)
309
+ if jsonSM:
310
+ links += sum((jsonSM[key] for key in jsonSM),[])
311
+ # no doi then google custom search api
312
+ if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
313
+ # might find the article
314
+ print("no article text, start tem link")
315
+ #tem_links = mtdna_classifier.search_google_custom(title, 2)
316
+ tem_links = smart_fallback.smart_google_search(meta_expand)
317
+ print("tem links: ", tem_links)
318
+ tem_link_acc = smart_fallback.google_accession_search(acc)
319
+ tem_links += tem_link_acc
320
+ tem_links = unique_preserve_order(tem_links)
321
+ print("tem link before filtering: ", tem_links)
322
+ # filter the quality link
323
+ print("saveLinkFolder as sample folder id: ", sample_folder_id)
324
+ print("start the smart filter link")
325
+ if stop_flag is not None and stop_flag.value:
326
+ print(f"🛑 Stop processing {accession}, aborting early...")
327
+ return {}
328
+ # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
329
+ # if success_process:
330
+ # links = output_process
331
+ # print("yes succeed for smart filter link")
332
+ # else:
333
+ # print("no suceed, fallback to all tem links")
334
+ # links = tem_links
335
+ links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
336
+ print("this is links: ",links)
337
+ links = unique_preserve_order(links)
338
+ acc_score["source"] = links
339
+ else:
340
+ print("inside the try of reusing chunk or all output")
341
+ #print("chunk filename: ", str(chunks_filename))
342
+
343
+ try:
344
+ temp_source = False
345
+ if save_df is not None and not save_df.empty:
346
+ print("save df not none")
347
+ print("chunk file name: ",str(chunk_filename))
348
+ print("all filename: ",str(all_filename))
349
+ if acc_score["file_chunk"]:
350
+ link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0]
351
+ #link = row["Sources"].iloc[0]
352
+ if "http" in link:
353
+ print("yeah http in save df source")
354
+ acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
355
+ else: # temporary
356
+ print("tempo source")
357
+ #acc_score["source"] = [str(all_filename), str(chunks_filename)]
358
+ temp_source = True
359
+ elif acc_score["file_all_output"]:
360
+ link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0]
361
+ #link = row["Sources"].iloc[0]
362
+ print(link)
363
+ print("list of link")
364
+ print([x for x in link.split("\n") if x.strip()])
365
+ if "http" in link:
366
+ print("yeah http in save df source")
367
+ acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
368
+ else: # temporary
369
+ print("tempo source")
370
+ #acc_score["source"] = [str(all_filename), str(chunks_filename)]
371
+ temp_source = True
372
+ else: # temporary
373
+ print("tempo source")
374
+ #acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
375
+ temp_source = True
376
+ else: # temporary
377
+ print("tempo source")
378
+ #acc_score["source"] = [str(file_all_path), str(file_chunk_path)]
379
+ temp_source = True
380
+ if temp_source:
381
+ print("temp source is true so have to try again search link")
382
+ if doi != "unknown":
383
+ link = 'https://doi.org/' + doi
384
+ # get the file to create listOfFile for each id
385
+ print("link of doi: ", link)
386
+ html = extractHTML.HTML("",link)
387
+ jsonSM = html.getSupMaterial()
388
+ article_text = html.getListSection()
389
+ if article_text:
390
+ if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
391
+ links.append(link)
392
+ if jsonSM:
393
+ links += sum((jsonSM[key] for key in jsonSM),[])
394
+ # no doi then google custom search api
395
+ if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
396
+ # might find the article
397
+ print("no article text, start tem link")
398
+ #tem_links = mtdna_classifier.search_google_custom(title, 2)
399
+ tem_links = smart_fallback.smart_google_search(meta_expand)
400
+ print("tem links: ", tem_links)
401
+ tem_link_acc = smart_fallback.google_accession_search(acc)
402
+ tem_links += tem_link_acc
403
+ tem_links = unique_preserve_order(tem_links)
404
+ print("tem link before filtering: ", tem_links)
405
+ # filter the quality link
406
+ print("saveLinkFolder as sample folder id: ", sample_folder_id)
407
+ print("start the smart filter link")
408
+ if stop_flag is not None and stop_flag.value:
409
+ print(f"🛑 Stop processing {accession}, aborting early...")
410
+ return {}
411
+ # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
412
+ # if success_process:
413
+ # links = output_process
414
+ # print("yes succeed for smart filter link")
415
+ # else:
416
+ # print("no suceed, fallback to all tem links")
417
+ # links = tem_links
418
+ links = smart_fallback.filter_links_by_metadata(tem_links, saveLinkFolder=sample_folder_id, accession=acc, stop_flag=stop_flag)
419
+ print("this is links: ",links)
420
+ links = unique_preserve_order(links)
421
+ acc_score["source"] = links
422
+ except:
423
+ print("except for source")
424
+ acc_score["source"] = []
425
+ # chunk_path = "/"+saveTitle+"_merged_document.docx"
426
+ # all_path = "/"+saveTitle+"_all_merged_document.docx"
427
+ # # if chunk and all output not exist yet
428
+ # file_chunk_path = saveLinkFolder + chunk_path
429
+ # file_all_path = saveLinkFolder + all_path
430
+ # if os.path.exists(file_chunk_path):
431
+ # print("File chunk exists!")
432
+ # if not chunk:
433
+ # text, table, document_title = model.read_docx_text(file_chunk_path)
434
+ # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
435
+ # if os.path.exists(file_all_path):
436
+ # print("File all output exists!")
437
+ # if not all_output:
438
+ # text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
439
+ # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
440
+ if stop_flag is not None and stop_flag.value:
441
+ print(f"🛑 Stop processing {accession}, aborting early...")
442
+ return {}
443
+ # print("chunk filename: ", chunk_filename)
444
+ # if chunk_exists:
445
+ # print("File chunk exists!")
446
+ # if not chunk:
447
+ # print("start to get chunk")
448
+ # text, table, document_title = model.read_docx_text(file_chunk_path)
449
+ # chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
450
+ # if str(chunk_filename) != "":
451
+ # print("first time have chunk path at chunk exist: ", str(chunk_filename))
452
+ # acc_score["file_chunk"] = str(chunk_filename)
453
+ # if all_exists:
454
+ # print("File all output exists!")
455
+ # if not all_output:
456
+ # text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
457
+ # all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
458
+ # if str(all_filename) != "":
459
+ # print("first time have all path at all exist: ", str(all_filename))
460
+ # acc_score["file_all_output"] = str(all_filename)
461
+ if not chunk and not all_output:
462
+ print("not chunk and all output")
463
+ # else: check if we can reuse these chunk and all output of existed accession to find another
464
+ if str(chunk_filename) != "":
465
+ print("first time have chunk path: ", str(chunk_filename))
466
+ acc_score["file_chunk"] = str(chunk_filename)
467
+ if str(all_filename) != "":
468
+ print("first time have all path: ", str(all_filename))
469
+ acc_score["file_all_output"] = str(all_filename)
470
+ if links:
471
+ for link in links:
472
+ print(link)
473
+ # if len(all_output) > 1000*1000:
474
+ # all_output = data_preprocess.normalize_for_overlap(all_output)
475
+ # print("after normalizing all output: ", len(all_output))
476
+ if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
477
+ print("break here")
478
+ break
479
+ if iso != "unknown": query_kw = iso
480
+ else: query_kw = acc
481
+ #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
482
+ success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100)
483
+ if stop_flag is not None and stop_flag.value:
484
+ print(f"🛑 Stop processing {accession}, aborting early...")
485
+ return {}
486
+ if success_process:
487
+ text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
488
+ print("yes succeed for process document")
489
+ else: text_link, tables_link, final_input_link = "", "", ""
490
+ context = data_preprocess.extract_context(final_input_link, query_kw)
491
+ if context != "Sample ID not found.":
492
+ if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
493
+ success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
494
+ if stop_flag is not None and stop_flag.value:
495
+ print(f"🛑 Stop processing {accession}, aborting early...")
496
+ return {}
497
+ if success_chunk:
498
+ chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
499
+ print("yes succeed for chunk")
500
+ else:
501
+ chunk += context
502
+ print("len context: ", len(context))
503
+ print("basic fall back")
504
+ print("len chunk after: ", len(chunk))
505
+ if len(final_input_link) > 1000*1000:
506
+ if context != "Sample ID not found.":
507
+ final_input_link = context
508
+ else:
509
+ final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
510
+ if len(final_input_link) > 1000 *1000:
511
+ final_input_link = final_input_link[:100000]
512
+ if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000:
513
+ print("Running merge_texts_skipping_overlap with timeout")
514
+ success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30)
515
+ if stop_flag is not None and stop_flag.value:
516
+ print(f"🛑 Stop processing {accession}, aborting early...")
517
+ return {}
518
+ print("Returned from timeout logic")
519
+ if success:
520
+ all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
521
+ print("yes succeed")
522
+ else:
523
+ print("len all output: ", len(all_output))
524
+ print("len final input link: ", len(final_input_link))
525
+ all_output += final_input_link
526
+ print("len final input: ", len(final_input_link))
527
+ print("basic fall back")
528
+ else:
529
+ print("both/either all output or final link too large more than 100000")
530
+ print("len all output: ", len(all_output))
531
+ print("len final input link: ", len(final_input_link))
532
+ all_output += final_input_link
533
+ print("len final input: ", len(final_input_link))
534
+ print("basic fall back")
535
+ print("len all output after: ", len(all_output))
536
+ #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
537
+ if stop_flag is not None and stop_flag.value:
538
+ print(f"🛑 Stop processing {accession}, aborting early...")
539
+ return {}
540
+ else:
541
+ chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
542
+ all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
543
+ if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
544
+ if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
545
+ if len(all_output) > 1*1024*1024:
546
+ all_output = data_preprocess.normalize_for_overlap(all_output)
547
+ if len(all_output) > 1*1024*1024:
548
+ all_output = all_output[:1*1024*1024]
549
+ print("chunk len: ", len(chunk))
550
+ print("all output len: ", len(all_output))
551
+ data_preprocess.save_text_to_docx(chunk, file_chunk_path)
552
+ data_preprocess.save_text_to_docx(all_output, file_all_path)
553
+ # Later when saving new files
554
+ # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
555
+ # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
556
+
557
+ # Upload to Drive
558
+ result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
559
+ result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
560
+ print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
561
+ print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
562
+ print("here 1")
563
+
564
+ # else:
565
+ # final_input = ""
566
+ # if all_output:
567
+ # final_input = all_output
568
+ # else:
569
+ # if chunk: final_input = chunk
570
+ # #data_preprocess.merge_texts_skipping_overlap(final_input, all_output)
571
+ # if final_input:
572
+ # keywords = []
573
+ # if iso != "unknown": keywords.append(iso)
574
+ # if acc != "unknown": keywords.append(acc)
575
+ # for keyword in keywords:
576
+ # chunkBFS = data_preprocess.get_contextual_sentences_BFS(final_input, keyword)
577
+ # countryDFS, chunkDFS = data_preprocess.get_contextual_sentences_DFS(final_input, keyword)
578
+ # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkDFS)
579
+ # chunk = data_preprocess.merge_texts_skipping_overlap(chunk, chunkBFS)
580
+
581
+ # Define paths for cached RAG assets
582
+ # faiss_index_path = saveLinkFolder+"/faiss_index.bin"
583
+ # document_chunks_path = saveLinkFolder+"/document_chunks.json"
584
+ # structured_lookup_path = saveLinkFolder+"/structured_lookup.json"
585
+ print("here 2")
586
+ faiss_filename = "faiss_index.bin"
587
+ chunks_filename = "document_chunks.json"
588
+ lookup_filename = "structured_lookup.json"
589
+ print("name of faiss: ", faiss_filename)
590
+
591
+ faiss_index_path = os.path.join(LOCAL_TEMP_DIR, faiss_filename)
592
+ document_chunks_path = os.path.join(LOCAL_TEMP_DIR, chunks_filename)
593
+ structured_lookup_path = os.path.join(LOCAL_TEMP_DIR, lookup_filename)
594
+ print("name if faiss path: ", faiss_index_path)
595
+ # 🔥 Remove the local file first if it exists
596
+ print("start faiss id and also the sample folder id is: ", sample_folder_id)
597
+ faiss_id = find_drive_file(faiss_filename, sample_folder_id)
598
+ print("done faiss id")
599
+ document_id = find_drive_file(chunks_filename, sample_folder_id)
600
+ structure_id = find_drive_file(lookup_filename, sample_folder_id)
601
+ if faiss_id and document_id and structure_id:
602
+ print("✅ 3 Files already exist in Google Drive. Downloading them...")
603
+ download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
604
+ download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
605
+ download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
606
+ # Read and parse these into `chunk` and `all_output`
607
+ else:
608
+ "one of id not exist"
609
+ if os.path.exists(faiss_index_path):
610
+ print("faiss index exist and start to remove: ", faiss_index_path)
611
+ os.remove(faiss_index_path)
612
+ if os.path.exists(document_chunks_path):
613
+ os.remove(document_chunks_path)
614
+ if os.path.exists(structured_lookup_path):
615
+ os.remove(structured_lookup_path)
616
+ print("start to download the faiss, chunk, lookup")
617
+
618
+ download_file_from_drive(faiss_filename, sample_folder_id, faiss_index_path)
619
+ download_file_from_drive(chunks_filename, sample_folder_id, document_chunks_path)
620
+ download_file_from_drive(lookup_filename, sample_folder_id, structured_lookup_path)
621
+ try:
622
+ print("try gemini 2.5")
623
+ print("move to load rag")
624
+ master_structured_lookup, faiss_index, document_chunks = model.load_rag_assets(
625
+ faiss_index_path, document_chunks_path, structured_lookup_path
626
+ )
627
+
628
+ global_llm_model_for_counting_tokens = genai.GenerativeModel('gemini-1.5-flash-latest')
629
+ if not all_output:
630
+ if chunk: all_output = chunk
631
+ else: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
632
+ if faiss_index is None:
633
+ print("\nBuilding RAG assets (structured lookup, FAISS index, chunks)...")
634
+ total_doc_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(
635
+ all_output
636
+ ).total_tokens
637
+
638
+ initial_embedding_cost = (total_doc_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
639
+ total_cost_title += initial_embedding_cost
640
+ print(f"Initial one-time embedding cost for '{file_all_path}' ({total_doc_embedding_tokens} tokens): ${initial_embedding_cost:.6f}")
641
+
642
+
643
+ master_structured_lookup, faiss_index, document_chunks, plain_text_content = model.build_vector_index_and_data(
644
+ file_all_path, faiss_index_path, document_chunks_path, structured_lookup_path
645
+ )
646
+ else:
647
+ print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
648
+ plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
649
+ master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
650
+ if stop_flag is not None and stop_flag.value:
651
+ print(f"🛑 Stop processing {accession}, aborting early...")
652
+ return {}
653
+ primary_word = iso
654
+ alternative_word = acc
655
+ print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
656
+ if features.lower() not in all_output.lower():
657
+ all_output += ". NCBI Features: " + features
658
+ # country, sample_type, method_used, ethnic, spe_loc, total_query_cost = model.query_document_info(
659
+ # primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
660
+ # model.call_llm_api, chunk=chunk, all_output=all_output)
661
+ print("this is chunk for the model")
662
+ print(chunk)
663
+ print("this is all output for the model")
664
+ print(all_output)
665
+ if stop_flag is not None and stop_flag.value:
666
+ print(f"🛑 Stop processing {accession}, aborting early...")
667
+ return {}
668
+ country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
669
+ primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
670
+ model.call_llm_api, chunk=chunk, all_output=all_output)
671
+ print("pass query of 2.5")
672
+ except:
673
+ print("try gemini 1.5")
674
+ country, sample_type, ethnic, spe_loc, method_used, country_explanation, sample_type_explanation, ethnicity_explanation, specific_loc_explanation, total_query_cost = model.query_document_info(
675
+ primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
676
+ model.call_llm_api, chunk=chunk, all_output=all_output, model_ai="gemini-1.5-flash-latest")
677
+ print("yeah pass the query of 1.5")
678
+ print("country using ai: ", country)
679
+ print("sample type using ai: ", sample_type)
680
+ # if len(country) == 0: country = "unknown"
681
+ # if len(sample_type) == 0: sample_type = "unknown"
682
+ # if country_explanation: country_explanation = "-"+country_explanation
683
+ # else: country_explanation = ""
684
+ # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
685
+ # else: sample_type_explanation = ""
686
+ if len(country) == 0: country = "unknown"
687
+ if len(sample_type) == 0: sample_type = "unknown"
688
+ if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation
689
+ else: country_explanation = ""
690
+ if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation
691
+ else: sample_type_explanation = ""
692
+
693
+ if method_used == "unknown": method_used = ""
694
+ if country.lower() != "unknown":
695
+ stand_country = standardize_location.smart_country_lookup(country.lower())
696
+ if stand_country.lower() != "not found":
697
+ if stand_country.lower() in acc_score["country"]:
698
+ if country_explanation:
699
+ acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
700
+ else:
701
+ acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
702
+ else:
703
+ if country.lower() in acc_score["country"]:
704
+ if country_explanation:
705
+ if len(method_used + country_explanation) > 0:
706
+ acc_score["country"][country.lower()].append(method_used + country_explanation)
707
+ else:
708
+ if len(method_used + country_explanation) > 0:
709
+ acc_score["country"][country.lower()] = [method_used + country_explanation]
710
+ # if spe_loc.lower() != "unknown":
711
+ # if spe_loc.lower() in acc_score["specific_location"]:
712
+ # acc_score["specific_location"][spe_loc.lower()].append(method_used)
713
+ # else:
714
+ # acc_score["specific_location"][spe_loc.lower()] = [method_used]
715
+ # if ethnic.lower() != "unknown":
716
+ # if ethnic.lower() in acc_score["ethnicity"]:
717
+ # acc_score["ethnicity"][ethnic.lower()].append(method_used)
718
+ # else:
719
+ # acc_score["ethnicity"][ethnic.lower()] = [method_used]
720
+ if sample_type.lower() != "unknown":
721
+ if sample_type.lower() in acc_score["sample_type"]:
722
+ if len(method_used + sample_type_explanation) > 0:
723
+ acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
724
+ else:
725
+ if len(method_used + sample_type_explanation)> 0:
726
+ acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
727
+ total_cost_title += total_query_cost
728
+ if stop_flag is not None and stop_flag.value:
729
+ print(f"🛑 Stop processing {accession}, aborting early...")
730
+ return {}
731
+ # last resort: combine all information to give all output otherwise unknown
732
+ if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown":
733
+ text = ""
734
+ for key in meta_expand:
735
+ text += str(key) + ": " + meta_expand[key] + "\n"
736
+ if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
737
+ text += data_preprocess.normalize_for_overlap(all_output)
738
+ if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
739
+ text += data_preprocess.normalize_for_overlap(chunk)
740
+ text += ". NCBI Features: " + features
741
+ print("this is text for the last resort model")
742
+ print(text)
743
+ country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost = model.query_document_info(
744
+ primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
745
+ model.call_llm_api, chunk=text, all_output=text)
746
+ print("this is last resort results: ")
747
+ print("country: ", country)
748
+ print("sample type: ", sample_type)
749
+ if len(country) == 0: country = "unknown"
750
+ if len(sample_type) == 0: sample_type = "unknown"
751
+ # if country_explanation: country_explanation = "-"+country_explanation
752
+ # else: country_explanation = ""
753
+ # if sample_type_explanation: sample_type_explanation = "-"+sample_type_explanation
754
+ # else: sample_type_explanation = ""
755
+ if country_explanation and country_explanation!="unknown": country_explanation = "-"+country_explanation
756
+ else: country_explanation = ""
757
+ if sample_type_explanation and sample_type_explanation!="unknown": sample_type_explanation = "-"+sample_type_explanation
758
+ else: sample_type_explanation = ""
759
+
760
+ if method_used == "unknown": method_used = ""
761
+ if country.lower() != "unknown":
762
+ stand_country = standardize_location.smart_country_lookup(country.lower())
763
+ if stand_country.lower() != "not found":
764
+ if stand_country.lower() in acc_score["country"]:
765
+ if country_explanation:
766
+ acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
767
+ else:
768
+ acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
769
+ else:
770
+ if country.lower() in acc_score["country"]:
771
+ if country_explanation:
772
+ if len(method_used + country_explanation) > 0:
773
+ acc_score["country"][country.lower()].append(method_used + country_explanation)
774
+ else:
775
+ if len(method_used + country_explanation) > 0:
776
+ acc_score["country"][country.lower()] = [method_used + country_explanation]
777
+ if sample_type.lower() != "unknown":
778
+ if sample_type.lower() in acc_score["sample_type"]:
779
+ if len(method_used + sample_type_explanation) > 0:
780
+ acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
781
+ else:
782
+ if len(method_used + sample_type_explanation)> 0:
783
+ acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
784
+ total_cost_title += total_query_cost
785
+ end = time.time()
786
+ #total_cost_title += total_query_cost
787
+ acc_score["query_cost"] = f"{total_cost_title:.6f}"
788
+ elapsed = end - start
789
+ acc_score["time_cost"] = f"{elapsed:.3f} seconds"
790
+ accs_output[acc] = acc_score
791
+ print(accs_output[acc])
792
+
793
+ return accs_output
core/smart_fallback.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Bio import Entrez, Medline
2
+ #import model
3
+ import core.mtdna_classifier
4
+ from core.NER.html import extractHTML
5
+ import core.data_preprocess
6
+ import core.pipeline
7
+ # Setup
8
def fetch_ncbi(accession_number):
    """Fetch basic GenBank metadata for one accession from NCBI nucleotide.

    Args:
        accession_number: Accession identifier; converted with ``str()``
            before being sent to ``Entrez.efetch``.

    Returns:
        dict with the keys ``authors``, ``institution``, ``isolate``,
        ``definition``, ``title``, ``seq_comment`` and ``collection_date``.
        Every field that could not be extracted holds the string
        ``"unknown"``. The function never raises for ordinary errors; on any
        failure it prints a message and returns the all-``"unknown"`` dict.

    Note:
        ``collection_date`` is filled from ``GBSeq_create-date`` (or
        ``GBSeq_update-date`` when the former is absent), i.e. record dates.
    """
    def _defaults():
        # Fresh dict on every call so callers can mutate the result safely.
        return {"authors": "unknown",
                "institution": "unknown",
                "isolate": "unknown",
                "definition": "unknown",
                "title": "unknown",
                "seq_comment": "unknown",
                "collection_date": "unknown"}

    try:
        Entrez.email = "your.email@example.com"  # Required by NCBI, REPLACE WITH YOUR EMAIL
        handle = Entrez.efetch(db="nucleotide", id=str(accession_number), rettype="gb", retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        # Validate record structure: a list with at least one dict element.
        if not (isinstance(record, list) and len(record) > 0):
            print(f"Warning: No valid record or empty record list from NCBI for {accession_number}.")
            return _defaults()
        if not isinstance(record[0], dict):
            print(f"Warning: record[0] is not a dictionary for {accession_number}. Type: {type(record[0])}")
            return _defaults()

        gb_seq = record[0]
        outputs = _defaults()

        # Record date: prefer the creation date, fall back to the update date.
        if "GBSeq_create-date" in gb_seq:
            outputs["collection_date"] = gb_seq["GBSeq_create-date"]
        elif "GBSeq_update-date" in gb_seq:
            outputs["collection_date"] = gb_seq["GBSeq_update-date"]

        if "GBSeq_definition" in gb_seq:
            outputs["definition"] = gb_seq["GBSeq_definition"]

        # Reference-derived fields: the first reference providing a field wins.
        for ref in gb_seq.get("GBSeq_references", []):
            if "GBReference_authors" in ref and outputs["authors"] == "unknown":
                outputs["authors"] = "and ".join(ref["GBReference_authors"])
            if "GBReference_title" in ref and outputs["title"] == "unknown":
                outputs["title"] = ref["GBReference_title"]
            if "GBReference_journal" in ref and outputs["institution"] == "unknown":
                outputs["institution"] = ref["GBReference_journal"]

        if "GBSeq_comment" in gb_seq:
            outputs["seq_comment"] = gb_seq["GBSeq_comment"]

        # Isolate: taken from the qualifiers of the first feature only.
        # An empty feature table must not discard the fields parsed above.
        features = gb_seq.get("GBSeq_feature-table") or []
        if features and "GBFeature_quals" in features[0]:
            for qual in features[0]["GBFeature_quals"]:
                if qual.get("GBQualifier_name") == "isolate" and "GBQualifier_value" in qual:
                    outputs["isolate"] = qual["GBQualifier_value"]
                    break
        return outputs
    except Exception:
        # Best effort: network or parsing problems degrade to "unknown" fields.
        print("error in fetching ncbi data")
        return _defaults()
80
+ # Fallback if NCBI crashed or cannot find accession on NBCI
81
def google_accession_search(accession_id):
    """
    Search for metadata by accession ID using Google Custom Search.
    Falls back to known biological databases and archives.

    Runs a fixed set of query variants (plain accession, site-restricted
    searches, mtDNA-qualified searches), requesting 2 results per query, and
    returns the unique links in first-seen order.
    """
    queries = [
        f"{accession_id}",
        f"{accession_id} site:ncbi.nlm.nih.gov",
        f"{accession_id} site:pubmed.ncbi.nlm.nih.gov",
        f"{accession_id} site:europepmc.org",
        f"{accession_id} site:researchgate.net",
        f"{accession_id} mtDNA",
        f"{accession_id} mitochondrial DNA"
    ]

    links = []
    for query in queries:
        # The file does `import core.mtdna_classifier`, which binds only the
        # name `core`; the module must be addressed through the package.
        search_results = core.mtdna_classifier.search_google_custom(query, 2)
        for link in search_results or []:
            if link not in links:
                links.append(link)
    return links
103
+
104
+ # Method 1: Smarter Google
105
def smart_google_queries(metadata: dict):
    """Build Google query strings from NCBI-style metadata.

    Args:
        metadata: Mapping that may contain ``isolate``, ``authors``,
            ``institution`` and ``title``. Missing, empty, ``"unknown"`` and
            ``"Unpublished"`` values are ignored.

    Returns:
        list of query strings, in the order isolate, author, institution,
        title. A title equal to ``"Direct Submission"`` is skipped.
    """
    def usable(value):
        # Placeholder values carry no search signal.
        return bool(value) and value != "unknown" and value != "Unpublished"

    queries = []

    isolate = metadata.get("isolate")
    author = metadata.get("authors")
    institution = metadata.get("institution")
    title = metadata.get("title")

    if usable(isolate):
        queries.append(f'"{isolate}" mitochondrial DNA')
        queries.append(f'"{isolate}" site:ncbi.nlm.nih.gov')

    if usable(author):
        try:
            author_name = author.split(',')[0]  # Use last name only
        except (AttributeError, TypeError):
            # Non-string author value: use it as-is.
            author_name = author
        queries.append(f'"{author_name}" mitochondrial DNA')
        queries.append(f'"{author_name}" mtDNA site:researchgate.net')

    if usable(institution):
        try:
            # Keep the first two comma-separated parts of the institution.
            short_inst = ",".join(institution.split(',')[:2])
        except (AttributeError, TypeError):
            short_inst = institution
        queries.append(f'"{short_inst}" mtDNA sequence')

    if usable(title) and title != "Direct Submission":
        queries.append(title)

    return queries
149
+
150
def filter_links_by_metadata(search_results, saveLinkFolder, accession=None, stop_flag=None):
    """Keep the search-result links whose text or URL mentions a relevant keyword.

    Args:
        search_results: Iterable of link strings; falsy entries are skipped.
        saveLinkFolder: Folder passed through to the text extractor.
        accession: Optional accession ID. When given it is the first keyword
            tried, and links that match it take precedence over all others.
        stop_flag: Optional object with a boolean ``value`` attribute; when
            it is set, processing aborts and ``[]`` is returned.

    Returns:
        list of links. If at least one link matched the accession, only those
        links are returned; otherwise every keyword-matching link plus any
        supplementary-material links found on the pages.
    """
    # Kept for the currently disabled trusted-domain shortcut.
    TRUSTED_DOMAINS = [
        "ncbi.nlm.nih.gov",
        "pubmed.ncbi.nlm.nih.gov",
        "pmc.ncbi.nlm.nih.gov",
        "biorxiv.org",
        "researchgate.net",
        "nature.com",
        "sciencedirect.com"
    ]

    def stop_requested():
        if stop_flag is not None and stop_flag.value:
            print(f"🛑 Stop detected {accession}, aborting early...")
            return True
        return False

    if stop_requested():
        return []

    def is_trusted_link(link):
        return any(domain in link for domain in TRUSTED_DOMAINS)

    def is_relevant_title_snippet(link, saveLinkFolder, accession=None):
        output = []
        keywords = ["mtDNA", "mitochondrial", "accession", "isolate", "Homo sapiens", "sequence"]
        if accession:
            keywords = [accession] + keywords
        title_snippet = link.lower()
        print("save link folder inside this filter function: ", saveLinkFolder)
        # The file imports `core.pipeline` / `core.data_preprocess`, which bind
        # only `core`; address the modules through the package.
        success_process, output_process = core.pipeline.run_with_timeout(
            core.data_preprocess.extract_text, args=(link, saveLinkFolder), timeout=60)
        if stop_requested():
            return []
        if success_process:
            article_text = output_process
            print("yes succeed for getting article text")
        else:
            print("no suceed, fallback to no link")
            article_text = ""
        print("article text")
        if stop_requested():
            return []
        try:
            ext = link.split(".")[-1].lower()
            if ext not in ["pdf", "docx", "xlsx"]:
                html = extractHTML.HTML("", link)
                if stop_requested():
                    return []
                jsonSM = html.getSupMaterial()
                if jsonSM:
                    # Flatten the per-key lists of supplementary links.
                    for key in jsonSM:
                        output.extend(jsonSM[key])
        except Exception as err:
            # Supplementary material is optional; report and continue.
            print("could not collect supplementary material: ", err)
        article_lower = article_text.lower()
        for keyword in keywords:
            needle = keyword.lower()
            # The first keyword that matches decides; the accession is tried first.
            if needle in article_lower:
                if link not in output:
                    output.append([link, needle])
                print("link and keyword for article text: ", link, keyword)
                return output
            if needle in title_snippet:
                if link not in output:
                    output.append([link, needle])
                print("link and keyword for title: ", link, keyword)
                return output
        return output

    filtered = []
    better_filter = []
    # Guard: `accession` defaults to None, so lower-case it only when present.
    accession_key = accession.lower() if accession else None
    for link in search_results or []:
        print(link)
        if stop_requested():
            return []
        if not link:
            continue
        output_link = is_relevant_title_snippet(link, saveLinkFolder, accession)
        print("output link: ")
        print(output_link)
        for out_link in output_link:
            if isinstance(out_link, list) and len(out_link) > 1:
                # [link, matched_keyword] pair
                print(out_link)
                kw = out_link[1]
                print("kw and acc: ", kw, accession_key)
                if accession_key and kw == accession_key:
                    better_filter.append(out_link[0])
                filtered.append(out_link[0])
            else:
                # Plain supplementary-material link
                filtered.append(out_link)
        print("done with link and here is filter: ", filtered)
    if better_filter:
        filtered = better_filter
    return filtered
245
+
246
def smart_google_search(metadata):
    """Run every query from ``smart_google_queries`` and collect unique links.

    Args:
        metadata: Metadata mapping forwarded to ``smart_google_queries``.

    Returns:
        list of links in first-seen order, 2 results requested per query.
    """
    links = []
    for q in smart_google_queries(metadata):
        # `import core.mtdna_classifier` binds only `core`, so the module has
        # to be reached through the package name.
        results = core.mtdna_classifier.search_google_custom(q, 2)
        for link in results or []:
            if link not in links:
                links.append(link)
    return links
258
+ # Method 2: Prompt LLM better or better ai search api with all
259
+ # the total information from even ncbi and all search
core/standardize_location.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, os
2
+ import requests
3
+ import core.model
4
+ # Normalize input
5
def normalize_key(text):
    """Return *text* lower-cased with everything except a-z and 0-9 removed."""
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    lowered = text.strip().lower()
    return "".join(ch for ch in lowered if ch in allowed)
7
+
8
+ # Search for city/place (normal flow)
9
def get_country_from_geonames(city_name):
    """Look up the country of a city/place through the GeoNames search endpoint.

    The endpoint URL and user name are read from the ``URL_SEARCHJSON`` and
    ``USERNAME_GEO`` environment variables (``KeyError`` if unset).

    Args:
        city_name: Free-text place name sent as the ``q`` parameter.

    Returns:
        The ``countryName`` of the first hit, or ``None`` when there is no
        hit or the request/parsing fails (the error is printed).
    """
    url = os.environ["URL_SEARCHJSON"]
    username = os.environ["USERNAME_GEO"]
    # Fixed: previously printed the undefined name `cityname` (NameError).
    print("geoname: ", city_name)
    params = {
        "q": city_name,
        "maxRows": 1,
        "username": username
    }
    try:
        r = requests.get(url, params=params, timeout=5)
        data = r.json()
        if data.get("geonames"):
            return data["geonames"][0]["countryName"]
    except Exception as e:
        print("GeoNames searchJSON error:", e)
    return None
26
+
27
+ # Search for country info using alpha-2/3 codes or name
28
def get_country_from_countryinfo(input_code):
    """Resolve a country name, alpha-2 code or alpha-3 code to a country name.

    Fetches the country list from the endpoint in ``URL_COUNTRYJSON`` (user
    name from ``USERNAME_GEO``) and compares the upper-cased, stripped input
    with each entry's name, ``countryCode`` and ``isoAlpha3``.

    Returns:
        The matching ``countryName``, or ``None`` when nothing matches or the
        request fails (the error is printed).
    """
    endpoint = os.environ["URL_COUNTRYJSON"]
    geo_user = os.environ["USERNAME_GEO"]
    print("countryINFO: ", input_code)
    try:
        response = requests.get(endpoint, params={"username": geo_user}, timeout=5)
        payload = response.json()
        if not payload.get("geonames"):
            return None
        wanted = input_code.strip().upper()
        for entry in payload["geonames"]:
            # Match against country name, country code (alpha-2), iso alpha-3
            candidates = (
                entry.get("countryName", "").upper(),
                entry.get("countryCode", "").upper(),
                entry.get("isoAlpha3", "").upper(),
            )
            if wanted in candidates:
                return entry["countryName"]
    except Exception as e:
        print("GeoNames countryInfoJSON error:", e)
    return None
51
+
52
+ # Combined smart lookup
53
def smart_country_lookup(user_input):
    """Resolve free-text location input to a country name.

    Lookup order: (1) for inputs of at most 3 alphanumeric characters, treat
    the input as a code ("UK" goes to the place search, anything else to the
    country-info list); (2) country-info list by full name; (3) place search.
    Input of the form "UK: London" is reduced to the part before the colon.

    Args:
        user_input: Country name, country code or place name.

    Returns:
        The resolved country name, or the string ``"Not found"``.
    """
    try:
        raw_input = user_input.strip()
        normalized = re.sub(r"[^a-zA-Z0-9]", "", user_input).upper()  # normalize for codes (no strip spaces!)
        print("raw input for smart country lookup: ", raw_input, ". Normalized country: ", normalized)
        # Special case: if user writes "UK: London" → split and take main country part
        if ":" in raw_input:
            raw_input = raw_input.split(":")[0].strip()  # only take "UK"
        already_tried_countryinfo = False
        # First try as country code (if 2-3 letters or common abbreviation)
        if len(normalized) <= 3:
            # `normalized` contains no punctuation, so "U.K"/"U.K." arrive here as "UK".
            if normalized == "UK":
                country = get_country_from_geonames(normalized)
                print("get_country_from_geonames(normalized.upper()) ", country)
                if country:
                    return country
            else:
                country = get_country_from_countryinfo(raw_input)
                already_tried_countryinfo = True
                print("get_country_from_countryinfo(raw_input) ", country)
                if country:
                    return country
        print(raw_input)
        if not already_tried_countryinfo:
            # try full names (skipped when the identical request just failed)
            country = get_country_from_countryinfo(raw_input)
            print("get_country_from_countryinfo(raw_input) ", country)
            if country:
                return country
        # Otherwise, treat as city/place
        country = get_country_from_geonames(raw_input)
        print("get_country_from_geonames(raw_input) ", country)
        if country:
            return country

        return "Not found"
    except Exception:
        # Fallback when the GeoNames lookups fail. `import core.model` binds
        # only `core`, so the module is addressed through the package.
        country = core.model.get_country_from_text(user_input)
        if country and country.lower() != "unknown":
            return country
        return "Not found"
core/upgradeClassify.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import spacy
3
+ from nltk.tokenize import sent_tokenize, word_tokenize
4
+ import nltk
5
+ nltk.download('punkt_tab')
6
+ #import coreferee
7
+ import copy
8
+ from sentence_transformers import SentenceTransformer, util
9
+ from sklearn.cluster import DBSCAN
10
+ from sklearn.metrics.pairwise import cosine_distances
11
+ from collections import defaultdict
12
+ import numpy as np
13
+ #from mtdna_classifier import infer_fromQAModel
14
+ # 1. SENTENCE-BERT MODEL
15
+ # Step 1: Preprocess the text
16
def normalize_text(text):
    """Normalize range separators and sample-ID ranges in *text*.

    Three rewrites are applied in order:
      1. Dashes, arrows and the standalone word "to" (with surrounding
         whitespace) become a single "-".
      2. "GEN10GEN30" becomes "GEN10-GEN30".
      3. "GEN10-30" becomes "GEN10-GEN30".

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # Normalize various separators to "-".
    # Arrow forms come before the bare dashes so "–>" is consumed as a unit,
    # and "to" is bounded by \b so it no longer matches inside words such as
    # "mitochondrial" or "Estonia".
    text = re.sub(r'\s*(--+>|->|–>|–+|—+|\bto\b|→|➝|➔|➡)\s*', '-', text, flags=re.IGNORECASE)
    # Fix GEN10GEN30 → GEN10-GEN30
    text = re.sub(r'\b([a-zA-Z]+)(\d+)(\1)(\d+)\b', r'\1\2-\1\4', text)
    # Fix GEN10-30 → GEN10-GEN30
    text = re.sub(r'\b([a-zA-Z]+)(\d+)-(\d+)\b', r'\1\2-\1\3', text)
    return text
24
+
25
def preprocess_text(text):
    """Split normalized *text* into sentences and strip punctuation from each.

    Returns a list of sentences containing only letters, digits, whitespace
    and hyphens, with surrounding whitespace removed.
    """
    cleaned = []
    for sentence in sent_tokenize(normalize_text(text)):
        cleaned.append(re.sub(r"[^a-zA-Z0-9\s\-]", "", sentence).strip())
    return cleaned
29
+
30
+ # Before step 2, check NLP cache to avoid calling it muliple times:
31
+ # Global model cache
32
+ _spacy_models = {}
33
+
34
def get_spacy_model(model_name, add_coreferee=False):
    """Return the spaCy pipeline for *model_name*, loading it at most once.

    Pipelines are cached in the module-level ``_spacy_models`` dict keyed by
    model name. ``add_coreferee`` only takes effect on the first load of a
    given model: it appends the "coreferee" pipe if it is not present.
    """
    try:
        return _spacy_models[model_name]
    except KeyError:
        pass
    pipeline_obj = spacy.load(model_name)
    if add_coreferee and "coreferee" not in pipeline_obj.pipe_names:
        pipeline_obj.add_pipe("coreferee")
    _spacy_models[model_name] = pipeline_obj
    return pipeline_obj
42
+
43
+ # Step 2: NER to Extract Locations and Sample Names
44
def extract_entities(text, sample_id=None):
    """Extract location names (GPE entities) and sample IDs from *text*.

    Args:
        text: Text to analyse with the ``en_core_web_sm`` spaCy model.
        sample_id: Optional sample ID. Its leading uppercase letters are used
            as a prefix to find sibling IDs (prefix followed by digits).

    Returns:
        ``(locations, samples)``: two lists of unique strings in arbitrary
        order. ``samples`` is empty when ``sample_id`` is ``None`` or does
        not start with an uppercase letter.
    """
    nlp = get_spacy_model("en_core_web_sm")
    doc = nlp(text)

    sample_like = re.compile(r'[A-Z]{2,5}\d{2,4}')
    locations = set()
    for ent in doc.ents:
        if ent.label_ != "GPE":
            continue
        name = ent.text
        stripped = name.strip()
        # Drop entities that look like sample IDs (e.g. XXX123).
        if sample_like.fullmatch(stripped):
            continue
        # Drop very short or purely numeric entities.
        if len(name) <= 2 or stripped.isdigit():
            continue
        locations.add(name)

    if sample_id is None:
        return list(locations), []

    prefix_match = re.match(r'[A-Z]+', sample_id)
    if prefix_match is None:
        # No uppercase prefix: previously crashed with AttributeError on `.group()`.
        return list(locations), []
    sample_prefix = prefix_match.group()
    samples = re.findall(rf'{sample_prefix}\d+', text)
    return list(locations), list(set(samples))
63
+
64
+ # Step 3: Build a Soft Matching Layer
65
+ # Handle patterns like "BRU1–BRU20" and identify BRU18 as part of it.
66
def is_sample_in_range(sample_id, sentence):
    """Tell whether *sentence* contains an ID range that covers *sample_id*.

    The sample ID is split into a prefix and its trailing number. After
    normalizing the sentence, ranges written as "PREFIXa-PREFIXb" or
    "PREFIXa-b" are checked for ``a <= number <= b``.

    Returns:
        ``True`` if a covering range is found, otherwise ``False`` (also when
        the ID has no prefix/number split).
    """
    prefix_hit = re.match(r'^([A-Z0-9]+?)(?=\d+$)', sample_id)
    number_hit = re.search(r'(\d+)$', sample_id)
    if prefix_hit is None or number_hit is None:
        return False

    prefix = prefix_hit.group(1)
    wanted = int(number_hit.group(1))
    normalized = normalize_text(sentence)

    range_patterns = (
        rf'{prefix}(\d+)\s*-\s*{prefix}(\d+)',  # full prefix on both sides
        rf'{prefix}(\d+)\s*-\s*(\d+)',          # prefix only on the first number
    )
    for range_pattern in range_patterns:
        for low, high in re.findall(range_pattern, normalized):
            if int(low) <= wanted <= int(high):
                return True
    return False
92
+
93
+ # Step 4: Use coreferree to merge the sentences have same coreference # still cannot cause packages conflict
94
+ # ========== HEURISTIC GROUP → LOCATION MAPPERS ==========
95
+ # === Generalized version to replace your old extract_sample_to_group_general ===
96
+ # === Generalized version to replace your old extract_group_to_location_general ===
97
def extract_population_locations(text):
    """Map population codes to "region\\ncountry" strings.

    Scans the normalized text for four consecutive newline-separated fields
    (name, population code, region, country) and returns a dict keyed by the
    upper-cased population code; later matches overwrite earlier ones.
    """
    normalized = normalize_text(text)
    row_pattern = re.compile(
        r'([A-Za-z ,\-]+)\n([A-Z]+\d*)\n([A-Za-z ,\-]+)\n([A-Za-z ,\-]+)',
        flags=re.IGNORECASE,
    )
    return {
        hit.group(2).upper(): f"{hit.group(3).strip()}\n{hit.group(4).strip()}"
        for hit in row_pattern.finditer(normalized)
    }
107
+
108
def extract_sample_ranges(text):
    """Expand "START-END POPCODE" ranges into a sample-ID → population map.

    Args:
        text: Raw text; it is normalized first so that the separator
            variants collapse to "-".

    Returns:
        dict mapping each sample ID in a matched range to the upper-cased
        population code. Every number is registered twice: zero-padded to
        3 digits (the historical key format) and padded to the digit width
        used by the range's start ID, so both "BRU018" and "BRU18" resolve
        for a range written as "BRU1-BRU20".
    """
    text = normalize_text(text)
    # Updated pattern to handle punctuation and line breaks
    pattern = r'\b([A-Z0-9]+\d+)[–\-]([A-Z0-9]+\d+)[,:\.\s]*([A-Z0-9]+\d+)\b'
    prefix_re = re.compile(r'^([A-Z0-9]+?)(?=\d+$)', re.IGNORECASE)
    sample_to_pop = {}
    for match in re.finditer(pattern, text, flags=re.IGNORECASE):
        start_id, end_id, pop_code = match.groups()
        start_prefix = prefix_re.match(start_id).group(1).upper()
        end_prefix = prefix_re.match(end_id).group(1).upper()
        if start_prefix != end_prefix:
            continue
        start_num = int(re.search(r'(\d+)$', start_id).group())
        end_num = int(re.search(r'(\d+)$', end_id).group())
        # Digit width as written in the source, e.g. 1 for "BRU1", 3 for "KHO001".
        width = len(start_id) - len(start_prefix)
        population = pop_code.upper()
        for i in range(start_num, end_num + 1):
            sample_to_pop[f"{start_prefix}{i:03d}"] = population
            sample_to_pop[f"{start_prefix}{i:0{width}d}"] = population

    return sample_to_pop
126
+
127
def filter_context_for_sample(sample_id, full_text, window_size=2):
    """Return only the sentences near mentions of *sample_id* or its group.

    A sentence is an anchor when it contains the sample ID, contains a range
    covering it, or contains the group code that ``extract_sample_ranges``
    assigns to the sample. Each anchor contributes itself plus
    ``window_size`` sentences on either side. If there is no anchor, the
    whole normalized text is returned.
    """
    normalized_text = normalize_text(full_text)
    sentences = sent_tokenize(normalized_text)

    # Sentences mentioning the sample directly or through a range.
    direct_hits = [
        pos for pos, sentence in enumerate(sentences)
        if sample_id in sentence or is_sample_in_range(sample_id, sentence)
    ]

    # Sentences mentioning the group the sample belongs to, if any.
    group_id = extract_sample_ranges(normalized_text).get(sample_id)
    group_hits = []
    if group_id:
        group_hits = [pos for pos, sentence in enumerate(sentences) if group_id in sentence]

    anchors = direct_hits + group_hits
    if not anchors:
        return normalized_text

    keep = set()
    for anchor in anchors:
        first = max(0, anchor - window_size)
        last = min(len(sentences), anchor + window_size + 1)
        keep.update(range(first, last))
    return " ".join(sentences[pos] for pos in sorted(keep))
161
+ # Load the SpaCy transformer model with coreferee
162
def mergeCorefSen(text):
    """Return the preprocessed sentence list for *text*.

    Currently a thin wrapper around ``preprocess_text``; no coreference
    merging is performed here.
    """
    return preprocess_text(text)
165
+
166
+ # Before step 5 and below, let check transformer cache to avoid calling again
167
+ # Global SBERT model cache
168
+ _sbert_models = {}
169
+
170
def get_sbert_model(model_name="all-MiniLM-L6-v2"):
    """Return the SentenceTransformer for *model_name*, creating it at most once.

    Instances are cached in the module-level ``_sbert_models`` dict.
    """
    try:
        return _sbert_models[model_name]
    except KeyError:
        encoder = SentenceTransformer(model_name)
        _sbert_models[model_name] = encoder
        return encoder
175
+
176
+ # Step 5: Sentence-BERT retriever → Find top paragraphs related to keyword.
177
+ '''Use sentence transformers to embed the sentence that mentions the sample and
178
+ compare it to sentences that mention locations.'''
179
+
180
def find_top_para(sample_id, text, top_k=5):
    """Rank the sentences of *text* by similarity to the first sample mention.

    Args:
        sample_id: Sample ID to look for (direct mention or covering range).
        text: Document text; split into sentences via ``mergeCorefSen``.
        top_k: Number of top-ranked sentence indices to return.

    Returns:
        ``(top_indices, sentences)`` where ``top_indices`` holds the indices
        of the ``top_k`` most similar sentences, or
        ``([], "No context found for sample")`` when no sentence mentions
        the sample.
    """
    sentences = mergeCorefSen(text)

    # Check for a mention first, so the model is neither loaded nor run when
    # there is nothing to compare against.
    sample_matches = [s for s in sentences if sample_id in s or is_sample_in_range(sample_id, s)]
    if not sample_matches:
        return [], "No context found for sample"

    model = get_sbert_model("all-mpnet-base-v2")
    embeddings = model.encode(sentences, convert_to_tensor=True)
    sample_embedding = model.encode(sample_matches[0], convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(sample_embedding, embeddings)[0]

    # Get top-k most similar sentence indices
    top_indices = cos_scores.argsort(descending=True)[:top_k]
    return top_indices, sentences
196
+
197
+ # Step 6: DBSCAN to cluster the group of similar paragraphs.
198
def clusterPara(tokens):
    """Cluster sentences with DBSCAN over cosine distances of their embeddings.

    Args:
        tokens: List of sentences.

    Returns:
        ``(clusters, sentence_to_cluster, centroids)``: sentences grouped by
        cluster label, the label of each sentence, and the mean embedding of
        each cluster.
    """
    encoder = get_sbert_model("all-mpnet-base-v2")
    vectors = encoder.encode(tokens)

    # DBSCAN on a precomputed cosine-distance matrix.
    pairwise = cosine_distances(vectors)
    labels = DBSCAN(eps=0.3, min_samples=1, metric="precomputed").fit_predict(pairwise)

    clusters = defaultdict(list)
    members = defaultdict(list)
    sentence_to_cluster = {}
    for sentence, vector, label in zip(tokens, vectors, labels):
        clusters[label].append(sentence)
        members[label].append(vector)
        sentence_to_cluster[sentence] = label

    centroids = {}
    for label, cluster_vectors in members.items():
        centroids[label] = np.mean(cluster_vectors, axis=0)
    return clusters, sentence_to_cluster, centroids
224
+
225
def rankSenFromCluster(clusters, sentence_to_cluster, centroids, target_sentence):
    """Order sentences by how close their cluster is to the target's cluster.

    Args:
        clusters: Mapping of cluster label to list of sentences.
        sentence_to_cluster: Mapping of sentence to cluster label.
        centroids: Mapping of cluster label to centroid vector.
        target_sentence: Sentence whose cluster is the reference point.

    Returns:
        list of positions (indices into the key order of
        ``sentence_to_cluster``) for every sentence except the target,
        nearest cluster first.
    """
    target_centroid = centroids[sentence_to_cluster[target_sentence]]
    # One pass to build the position lookup instead of list.index per sentence.
    position = {sen: idx for idx, sen in enumerate(sentence_to_cluster)}

    # Distance from the target cluster's centroid to every centroid.
    dists = [
        (label, cosine_distances([target_centroid], [centroid])[0][0])
        for label, centroid in centroids.items()
    ]
    dists.sort(key=lambda x: x[1])  # sort by proximity

    sen_rank = []
    for label, _ in dists:
        for sen in clusters[label]:
            if sen != target_sentence:
                sen_rank.append(position[sen])
    return sen_rank
242
+ # Step 7: Final Inference Wrapper
243
def infer_location_for_sample(sample_id, context_text):
    """Infer location names for *sample_id* from *context_text*.

    For each of the top Sentence-BERT matches, in rank order, the sentence
    itself is searched for location entities; if it has none, the sentence
    at the same rank in the DBSCAN cluster ordering is tried next.

    Returns:
        A list of location names from the first sentence that yields any, or
        the string ``"No clear location found in top matches"``.
    """
    not_found = "No clear location found in top matches"
    top_indices, sentences = find_top_para(sample_id, context_text, top_k=5)
    # find_top_para signals "no context" with a message string in place of
    # the sentence list.
    if isinstance(sentences, str) or len(top_indices) == 0:
        return not_found

    clusters, sentence_to_cluster, centroids = clusterPara(sentences)
    most_top_sentence = sentences[int(top_indices[0])]
    cluster_rank = None  # computed lazily, only if the top sentence has no location

    for i in range(len(top_indices)):
        # Start with the i-th ranked Sentence-BERT result.
        best_sentence = sentences[int(top_indices[i])]
        locations, _ = extract_entities(best_sentence, sample_id)
        if locations:
            return locations

        # No location there: try the i-th sentence of the cluster ranking.
        if cluster_rank is None:
            cluster_rank = rankSenFromCluster(clusters, sentence_to_cluster, centroids, most_top_sentence)
        if i >= len(cluster_rank):
            break
        # NOTE(review): cluster_rank positions follow the unique-sentence order
        # of sentence_to_cluster; confirm they line up with `sentences` when
        # the text contains duplicate sentences.
        best_sentence_DBSCAN = sentences[cluster_rank[i]]
        locations, _ = extract_entities(best_sentence_DBSCAN, sample_id)
        if locations:
            return locations
        # Otherwise backtrack to the next best Sentence-BERT sentence.

    # Last resort: LLM (e.g. chatGPT, deepseek, etc.)
    return not_found
env.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ name: mtDNA
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - python=3.10
6
+ - pip
7
+ - pip:
8
+ - -r requirements.txt