VyLala committed on
Commit
ed9311e
·
verified ·
1 Parent(s): 67cedb6

Update data_preprocess.py

Browse files
Files changed (1) hide show
  1. data_preprocess.py +876 -777
data_preprocess.py CHANGED
@@ -1,778 +1,877 @@
1
- import re
2
- import os
3
- #import streamlit as st
4
- import subprocess
5
- import re
6
- from Bio import Entrez
7
- from docx import Document
8
- import fitz
9
- import spacy
10
- from spacy.cli import download
11
- from NER.PDF import pdf
12
- from NER.WordDoc import wordDoc
13
- from NER.html import extractHTML
14
- from NER.word2Vec import word2vec
15
- #from transformers import pipeline
16
- import urllib.parse, requests
17
- from pathlib import Path
18
- import pandas as pd
19
- import model
20
- import pipeline
21
- import tempfile
22
- import nltk
23
- nltk.download('punkt_tab')
24
def download_excel_file(url, save_path="temp.xlsx", timeout=30):
    """Download an Excel workbook to ``save_path`` and return the local path.

    Two kinds of URL trigger a download:

    * Office Online viewer links (``view.officeapps.live.com``): the real
      file URL is read from the ``src`` query parameter.
    * Direct ``http(s)`` links ending in ``.xls`` or ``.xlsx``.

    Anything else is returned unchanged (it is treated as a path that is
    already available).

    Args:
        url: Viewer link, direct link, or an already-local path.
        save_path: Destination for the downloaded bytes.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``save_path`` when a download happened, otherwise ``url``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        KeyError: A viewer link carries no ``src`` parameter.
    """
    if "view.officeapps.live.com" in url:
        query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        download_url = urllib.parse.unquote(query["src"][0])
    elif url.startswith("http") and url.endswith((".xls", ".xlsx")):
        download_url = url
    else:
        print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
        return url

    response = requests.get(download_url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page as a workbook.
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)
    print(len(response.content))
    return save_path
42
def extract_text(link, saveFolder):
    """Extract the plain text of the document behind ``link`` (best effort).

    A local copy named after the last ``/`` segment of ``link`` is looked up
    in the system temp directory.  When missing it is fetched through
    ``pipeline.find_drive_file`` / ``pipeline.download_file_from_drive`` or,
    failing that, downloaded from ``link`` and uploaded with
    ``pipeline.upload_file_to_drive``.  ``.pdf`` and ``.doc``/``.docx`` links
    are parsed from the local copy; any other non-Excel link containing
    ``http`` or ``html`` is handed to ``extractHTML.HTML``.

    Args:
        link: URL or path of the document.
        saveFolder: Folder identifier passed through to the ``pipeline``
            helpers and the parser classes.

    Returns:
        The extracted text, or ``""`` when nothing could be extracted.  Any
        failure is reported on stdout and yields ``""``.
    """
    # NOTE(review): block nesting was reconstructed from a de-indented
    # listing -- confirm against the repository copy.
    text = ""
    name = link.split("/")[-1]
    print("name: ", name)
    local_temp_path = os.path.join(tempfile.gettempdir(), name)
    print("this is local temp path: ", local_temp_path)
    try:
        if os.path.exists(local_temp_path):
            print("exist")
        else:
            # 1. Check if the file exists in the shared Google Drive folder.
            if pipeline.find_drive_file(name, saveFolder):
                print("📥 Downloading from Google Drive...")
                pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
            else:
                print("🌐 Downloading from web link...")
                response = requests.get(link)
                with open(local_temp_path, 'wb') as f:
                    f.write(response.content)
                print("✅ Saved locally.")
                # 2. Upload to Drive so it is available for later runs.
                pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
            print(local_temp_path)
        input_to_class = local_temp_path

        if link.endswith(".pdf"):
            print("inside pdf and input to class: ", input_to_class)
            print("save folder in extract text: ", saveFolder)
            text = pdf.PDFFast(input_to_class, saveFolder).extract_text()
            print("len text from pdf:")
            print(len(text))
        elif link.endswith((".doc", ".docx")):
            text = wordDoc.WordDocFast(input_to_class, saveFolder).extractText()
        else:
            extension = link.split(".")[-1].lower()
            # Compare against real extensions; `ext in "xlsx"` was a
            # substring test that also matched "x", "ls", "" ...
            is_excel = extension in ("xls", "xlsx")
            if not is_excel and ("http" in link or "html" in link):
                print("html link: ", link)
                html = extractHTML.HTML("", link)
                text = html.getListSection()  # the text is already clean
                print("len text html: ")
                print(len(text))
        print("done extract text")
    except Exception as e:
        print(f"extract_text failed for {link}: {e}")
        text = ""
    finally:
        # Always drop the temp copy; an empty name would point at the temp
        # directory itself, so it is skipped.
        if name and os.path.isfile(local_temp_path):
            try:
                os.remove(local_temp_path)
                print(f"🧹 Deleted local temp file: {local_temp_path}")
            except OSError as e:
                # A cleanup failure must not discard text already extracted.
                print(f"could not delete {local_temp_path}: {e}")
    return text
117
-
118
def extract_table(link, saveFolder):
    """Extract the tables of the document behind ``link`` (best effort).

    The local copy is obtained exactly as in ``extract_text`` (temp
    directory, then the ``pipeline`` Drive helpers, then a web download).
    PDF, Word, Excel and HTML sources are supported; the raw result is
    passed through ``clean_tables_format``.

    Args:
        link: URL or path of the document.
        saveFolder: Folder identifier passed through to the ``pipeline``
            helpers and the parser classes.

    Returns:
        The cleaned tables, or ``[]`` when nothing could be extracted.  Any
        failure is reported on stdout and yields ``[]``.
    """
    # NOTE(review): block nesting was reconstructed from a de-indented
    # listing -- confirm against the repository copy.
    table = []
    name = link.split("/")[-1]
    local_temp_path = os.path.join(tempfile.gettempdir(), name)
    try:
        if os.path.exists(local_temp_path):
            print("exist")
        else:
            # 1. Check if the file exists in the shared Google Drive folder.
            if pipeline.find_drive_file(name, saveFolder):
                print("📥 Downloading from Google Drive...")
                pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
            else:
                print("🌐 Downloading from web link...")
                response = requests.get(link)
                with open(local_temp_path, 'wb') as f:
                    f.write(response.content)
                print("✅ Saved locally.")
                # 2. Upload to Drive so it is available for later runs.
                pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)
            print(local_temp_path)
        input_to_class = local_temp_path

        extension = link.split(".")[-1].lower()
        if link.endswith(".pdf"):
            table = pdf.PDF(input_to_class, saveFolder).extractTable()
        elif link.endswith((".doc", ".docx")):
            table = wordDoc.WordDocFast(input_to_class, saveFolder).extractTableAsList()
        elif extension in ("xls", "xlsx"):  # was a substring test on "xlsx"
            # NOTE(review): the downloaded path is not used afterwards; the
            # workbook is read from the temp copy -- confirm this is intended.
            savePath = saveFolder + "/" + name
            download_excel_file(link, savePath)
            try:
                # Context manager closes the workbook handle before cleanup.
                with pd.ExcelFile(local_temp_path) as xls:
                    table = [
                        pd.read_excel(xls, sheet_name=sheet_name)
                        .fillna("").astype(str).values.tolist()
                        for sheet_name in xls.sheet_names
                    ]
            except Exception as e:
                print("❌ Failed to extract tables from Excel:", e)
        elif "http" in link or "html" in link:
            table = extractHTML.HTML("", link).extractTable()  # a list

        table = clean_tables_format(table) if table else []
    except Exception as e:
        print(f"extract_table failed for {link}: {e}")
        table = []
    finally:
        # Remove only a real file: with an empty name the path is the temp
        # directory itself, and os.remove() on it used to raise and wipe
        # the result.
        if name and os.path.isfile(local_temp_path):
            try:
                os.remove(local_temp_path)
                print(f"🧹 Deleted local temp file: {local_temp_path}")
            except OSError as e:
                print(f"could not delete {local_temp_path}: {e}")
    return table
190
-
191
def clean_tables_format(tables):
    """Normalise ``tables`` to ``List[List[List[str]]]``.

    * ``pandas.DataFrame`` entries are converted to lists of string rows.
    * Entries that are not a list of lists are skipped.
    * Every cell is stringified and stripped; empty cells, empty rows and
      empty tables are dropped.

    Returns an empty list when ``tables`` is falsy.
    """
    result = []
    for entry in tables or []:
        if isinstance(entry, pd.DataFrame):
            entry = entry.fillna("").astype(str).values.tolist()

        is_list_of_lists = isinstance(entry, list) and all(
            isinstance(row, list) for row in entry
        )
        if not is_list_of_lists:
            continue

        rows = []
        for row in entry:
            cells = [value for value in (str(cell).strip() for cell in row) if value]
            if cells:
                rows.append(cells)
        if rows:
            result.append(rows)
    return result
219
-
220
- import json
221
def normalize_text_for_comparison(s: str) -> str:
    """Return ``s`` in a canonical form for substring comparison.

    The text is lower-cased, every run of whitespace (spaces, tabs and all
    newline flavours) is collapsed to a single space, and leading/trailing
    whitespace is removed.  Paragraph breaks are therefore not preserved.
    """
    lowered = s.lower()
    # Newline unification is kept for readability; the collapse below
    # already treats every newline flavour as whitespace.
    unified = lowered.replace('\r\n', '\n').replace('\r', '\n')
    collapsed = re.sub(r'\s+', ' ', unified)
    return collapsed.strip()
239
def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
    """Merge document text and relevant table rows into one LLM input string.

    A table is considered relevant when its flattened, lower-cased content
    contains a location keyword, ``accession_id`` or ``isolate``.  Each row
    of a relevant table is appended as a JSON snippet unless its normalised
    content already occurs in the normalised text.

    Args:
        text: Cleaned document text.
        tables: Iterable of tables, each a list of rows (lists of cells).
        max_tokens, keep_tables, tokenizer: Accepted for interface
            compatibility; not used by the current implementation.
        accession_id: Optional accession to look for in tables.
        isolate: Optional isolate name to look for in tables.

    Returns:
        ``"## Document Text"`` followed by the text and the selected rows,
        stripped of surrounding whitespace.
    """
    import json

    # Built once: the keyword list does not depend on the table.
    keywords = ["province", "district", "region", "village", "location",
                "country", "origin", "ancient", "modern"]
    if accession_id:
        keywords.append(accession_id.lower())
    if isolate:
        keywords.append(isolate.lower())

    def row_text(row):
        # str() guards against numeric / None cells, which str.join rejects.
        return " ".join(str(cell) for cell in row)

    def is_table_relevant(table):
        flat = " ".join(row_text(row).lower() for row in table)
        return any(kw in flat for kw in keywords)

    parts = ["## Document Text\n" + text.strip() + "\n"]
    clean_text = normalize_text_for_comparison(text)

    for idx, table in enumerate(tables or []):
        if not table or not is_table_relevant(table):
            continue
        for row in table:
            if not row:
                continue
            # Joining with " " or "\n" normalises to the same string, so a
            # single containment check is enough.
            clean_preview = normalize_text_for_comparison(row_text(row))
            if clean_preview not in clean_text:
                table_str = json.dumps([row], indent=2)
                parts.append(f"## Table {idx+1}\n{table_str}\n")
    return "".join(parts).strip()
282
-
283
def preprocess_document(link, saveFolder, accession=None, isolate=None, article_text=None):
    """Extract text and tables for ``link`` and build the combined input.

    Args:
        link: URL or path of the document.
        saveFolder: Folder identifier forwarded to the extractors.
        accession: Accepted for interface compatibility; currently unused.
        isolate: Accepted for interface compatibility; currently unused.
        article_text: When truthy, used as the text instead of calling
            ``extract_text``.

    Returns:
        ``(text, tables, final_input)`` where ``final_input`` is the text
        followed by the string form of every table, joined with ``", "``.
        Extraction failures degrade to ``""`` / ``[]`` instead of raising.
    """
    if article_text:
        print("article text already available")
        text = article_text
    else:
        print("start preprocess and extract text")
        try:
            text = extract_text(link, saveFolder)
        except Exception as e:
            print(f"extract text failed: {e}")
            text = ""

    tables = []
    print("extract table start")
    try:
        # Table extraction can hang on large files, hence the timeout.
        success, the_output = pipeline.run_with_timeout(
            extract_table, args=(link, saveFolder), timeout=10
        )
        print("Returned from timeout logic")
        if success:
            tables = the_output or []
            print("yes succeed for extract table")
        else:
            print("not suceed etxract table")
    except Exception as e:
        print(f"extract table failed: {e}")
        tables = []

    print("just merge text and tables")
    try:
        # Tables are nested lists; joining them directly raised TypeError
        # and silently turned the whole input into "".
        final_input = text + ", ".join(str(table) for table in tables)
    except TypeError:
        print("no succeed here in preprocess docu")
        final_input = ""
    return text, tables, final_input
322
-
323
def extract_sentences(text):
    """Split ``text`` after ``.``, ``!`` or ``?`` followed by whitespace.

    Returns the stripped, non-empty pieces in their original order.
    """
    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+', text):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
326
-
327
def is_irrelevant_number_sequence(text):
    """Tell whether ``text`` is essentially a run of bare numbers.

    Text containing a code-like token (two or more letters followed by
    digits, e.g. ``KM1``) or a word followed by a number is never
    irrelevant.  Otherwise the text is irrelevant when fewer than 20% of
    its whitespace-separated tokens are words and more than 50% are
    numbers, or when every token is a plain (dotted) number.

    Returns:
        ``True`` for number-only noise, ``False`` otherwise (including for
        empty text).
    """
    if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
        return False

    tokens = re.findall(r'\S+', text)
    if not tokens:
        return False

    total_tokens = len(tokens)
    word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
    number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
    if word_count / total_tokens < 0.2 and number_count / total_tokens > 0.5:
        return True

    # Per-token check accepting the same strings as the former
    # r'(\d+(\.\d+)?\s*)+' full match, without its exponential backtracking
    # on inputs such as "1111...1a".
    return all(re.fullmatch(r'\d+(?:\.\d+)*', token) for token in tokens)
338
-
339
def remove_isolated_single_digits(sentence):
    """Drop stand-alone ``0`` and ``1`` tokens from ``sentence``.

    The sentence is split on whitespace and re-joined with single spaces,
    so whitespace runs are collapsed as a side effect.  Other digits are
    kept.
    """
    kept = [token for token in sentence.split() if token not in ('0', '1')]
    return ' '.join(kept).strip()
348
-
349
def get_contextual_sentences_BFS(text_content, keyword, depth=2):
    """Collect sentences related to ``keyword`` by breadth-first expansion.

    Seed sentences are those that contain ``keyword`` as a whole word, or a
    code range such as ``A1YU101-A1YU137`` whose prefix matches the keyword
    and whose numeric bounds enclose the keyword's number.  Every code (two
    or more letters followed by digits) in a seed sentence becomes a start
    node.  Codes sharing a sentence are neighbours; the search expands
    start nodes up to ``depth`` levels and gathers all sentences of the
    visited codes.

    Args:
        text_content: Text to search.
        keyword: Sample identifier to look for.
        depth: Number of levels to expand; nodes at level ``depth`` or
            deeper contribute no sentences.

    Returns:
        The relevant sentences, filtered through
        ``is_irrelevant_number_sequence`` and
        ``remove_isolated_single_digits``, de-duplicated, sorted and joined
        with newlines.
    """
    from collections import deque

    code_re = re.compile(r'\b[A-Z]{2,}[0-9]+\b', re.IGNORECASE)
    # Splits e.g. 'A1YU101' into ('A1YU', '101'): lazy prefix + trailing digits.
    split_re = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
    range_re = re.compile(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', re.IGNORECASE)
    keyword_re = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)

    def split_code(code):
        match = split_re.search(code)
        return (match.group(1).lower(), int(match.group(2))) if match else None

    sentences = extract_sentences(text_content)
    keyword_parts = split_code(keyword)
    relevant_sentences = set()
    initial_keywords = set()

    for sentence in sentences:
        sentence_added = False

        # 1. Exact (whole-word, case-insensitive) match of the keyword.
        if keyword_re.search(sentence):
            initial_keywords.add(keyword.lower())
            sentence_added = True

        # 2. Range match: same prefix and the keyword number within bounds.
        if keyword_parts is not None:
            prefix, number = keyword_parts
            for range_match in range_re.finditer(sentence):
                start = split_code(range_match.group(1))
                end = split_code(range_match.group(2))
                if (start and end
                        and start[0] == prefix == end[0]
                        and start[1] <= number <= end[1]):
                    initial_keywords.add(range_match.group(1).lower())
                    initial_keywords.add(range_match.group(2).lower())
                    sentence_added = True
                    break  # one matching range per sentence is enough

        # 3. Seed the traversal with every code of a matching sentence.
        if sentence_added:
            relevant_sentences.add(sentence.strip())
            initial_keywords.update(c.lower() for c in code_re.findall(sentence))

    # code -> sentences containing it, and code -> codes sharing a sentence.
    word_to_sentences = {}
    graph = {}
    for sentence in sentences:
        codes = {c.lower() for c in code_re.findall(sentence)}
        for code in codes:
            word_to_sentences.setdefault(code, set()).add(sentence.strip())
            graph.setdefault(code, set()).update(codes - {code})

    queue = deque((k, 0) for k in initial_keywords if k in word_to_sentences)
    visited_words = set(initial_keywords)
    while queue:
        current_word, level = queue.popleft()
        if level >= depth:
            continue
        relevant_sentences.update(word_to_sentences.get(current_word, ()))
        for neighbor in graph.get(current_word, ()):
            if neighbor not in visited_words:
                visited_words.add(neighbor)
                queue.append((neighbor, level + 1))

    final_sentences = set()
    for sentence in relevant_sentences:
        if is_irrelevant_number_sequence(sentence):
            continue
        processed = remove_isolated_single_digits(sentence)
        if processed:
            final_sentences.add(processed)
    return "\n".join(sorted(final_sentences))
461
-
462
-
463
-
464
def get_contextual_sentences_DFS(text_content, keyword, depth=2):
    """Depth-first search for a country near ``keyword`` in ``text_content``.

    Starting from ``keyword``, each sentence containing the current word is
    passed to ``model.get_country_from_text`` (first a window from
    ``extract_context``, then the full sentence).  The search stops at the
    first answer other than ``"unknown"``.  Otherwise it recurses into the
    other codes of the sentence, nearest to the current word first, up to
    ``depth`` levels.

    Args:
        text_content: Text to search.
        keyword: Sample identifier to start from.
        depth: Maximum recursion depth.

    Returns:
        ``(country, context)``.  ``context`` is the visited sentences,
        filtered, sorted and newline-joined, or ``text_content`` itself
        when no sentence survives filtering.
    """
    # NOTE(review): block nesting was reconstructed from a de-indented
    # listing -- confirm against the repository copy.
    code_re = re.compile(r'\b[A-Z]{2,}[0-9]+\b', re.IGNORECASE)
    sentences = extract_sentences(text_content)

    # word -> sentences containing it.  A dict is used as an ordered set so
    # traversal follows document order; plain sets made the visit order,
    # and therefore the early-stop result, vary between runs.
    word_to_sentences = {}
    for sent in sentences:
        stripped = sent.strip()
        for word in set(re.findall(r'\b[A-Za-z0-9\-_\/]+\b', sent)):
            word_to_sentences.setdefault(word.lower(), {})[stripped] = None

    def dfs_traverse(current_word, current_depth, visited_words,
                     collected_sentences, parent_sentence=None):
        country = "unknown"
        if current_depth > depth or current_word not in word_to_sentences:
            return country, False

        for sentence in word_to_sentences[current_word]:
            if sentence == parent_sentence:
                continue  # avoid reusing the sentence we came from
            collected_sentences.add(sentence)

            # Try a narrow window around the word first, then the sentence.
            small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
            country = model.get_country_from_text(small_sen)
            if country.lower() != "unknown":
                return country, True
            country = model.get_country_from_text(sentence)
            if country.lower() != "unknown":
                return country, True

            codes = code_re.findall(sentence)
            lowered = [code.lower() for code in codes]
            target = current_word.lower()
            if target not in lowered:
                continue
            idx = lowered.index(target)

            # First position of each exact code string (list.index semantics).
            first_pos = {}
            for position, code in enumerate(codes):
                first_pos.setdefault(code, position)

            # Nearest codes first; on ties prefer the one after the word.
            children = sorted(
                (code for code in codes if code.lower() not in visited_words),
                key=lambda code: (abs(first_pos[code] - idx),
                                  0 if first_pos[code] > idx else 1),
            )
            for child in children:
                child_lower = child.lower()
                if child_lower in visited_words:
                    continue
                visited_words.add(child_lower)
                country, should_stop = dfs_traverse(
                    child_lower, current_depth + 1,
                    visited_words, collected_sentences, parent_sentence=sentence,
                )
                if should_stop:
                    return country, True
        return country, False

    collected_sentences = set()
    visited_words = {keyword.lower()}
    country, _ = dfs_traverse(keyword.lower(), 0, visited_words, collected_sentences)

    final_sentences = set()
    for sentence in collected_sentences:
        if is_irrelevant_number_sequence(sentence):
            continue
        processed = remove_isolated_single_digits(sentence)
        if processed:
            final_sentences.add(processed)
    if not final_sentences:
        return country, text_content
    return country, "\n".join(sorted(final_sentences))
547
-
548
- # Helper function for normalizing text for overlap comparison
549
def normalize_for_overlap(s: str) -> str:
    """Reduce ``s`` to lower-case ASCII letters/digits separated by single spaces.

    Every character outside ``[a-zA-Z0-9]`` and whitespace becomes a space,
    whitespace runs are collapsed, and the result is stripped.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    collapsed = re.sub(r'\s+', ' ', alnum_only.lower())
    return collapsed.strip()
553
-
554
def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
    """Concatenate two texts without repeating their shared part.

    Resolution order:

    1. If either text is empty or contained in the other, the longer or
       containing text is returned unchanged.
    2. Junction overlap: the longest suffix of ``text1`` that equals a
       prefix of ``text2`` (exactly, or after ``normalize_for_overlap``)
       is emitted once.
    3. Common prefix: when both texts start identically and both have a
       non-blank remainder, the prefix is emitted once.
    4. Otherwise the texts are simply concatenated.

    For cases 2-4 whitespace runs in the result are collapsed to single
    spaces and the result is stripped.
    """
    if not text1:
        return text2
    if not text2:
        return text1
    if text2 in text1:
        return text1
    if text1 in text2:
        return text2

    def squeeze(merged):
        return re.sub(r'\s+', ' ', merged).strip()

    # Junction overlap, longest candidate first.
    for size in range(min(len(text1), len(text2)), 0, -1):
        suffix1 = text1[-size:]
        prefix2 = text2[:size]
        if suffix1 == prefix2:
            return squeeze(text1 + text2[size:])
        normalized_suffix = normalize_for_overlap(suffix1)
        # Two punctuation/whitespace-only edges both normalise to "" and
        # used to count as an overlap, silently dropping characters of
        # text2; require real content.
        if normalized_suffix and normalized_suffix == normalize_for_overlap(prefix2):
            return squeeze(text1 + text2[size:])

    # Longest common prefix.
    prefix_len = 0
    for left, right in zip(text1, text2):
        if left != right:
            break
        prefix_len += 1

    remainder2 = text2[prefix_len:]
    if prefix_len > 0 and text1[prefix_len:].strip() and remainder2.strip():
        # prefix + rest of text1 + rest of text2 == text1 + rest of text2
        return squeeze(text1 + remainder2)

    return squeeze(text1 + text2)
621
-
622
- from docx import Document
623
- from pipeline import upload_file_to_drive
624
- # def save_text_to_docx(text_content: str, file_path: str):
625
- # """
626
- # Saves a given text string into a .docx file.
627
-
628
- # Args:
629
- # text_content (str): The text string to save.
630
- # file_path (str): The full path including the filename where the .docx file will be saved.
631
- # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
632
- # """
633
- # try:
634
- # document = Document()
635
-
636
- # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
637
- # for paragraph_text in text_content.split('\n'):
638
- # document.add_paragraph(paragraph_text)
639
-
640
- # document.save(file_path)
641
- # print(f"Text successfully saved to '{file_path}'")
642
- # except Exception as e:
643
- # print(f"Error saving text to docx file: {e}")
644
- # def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
645
- # """
646
- # Saves a given text string into a .docx file locally, then uploads to Google Drive.
647
-
648
- # Args:
649
- # text_content (str): The text string to save.
650
- # filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
651
- # drive_folder_id (str): Google Drive folder ID where to upload the file.
652
- # """
653
- # try:
654
- # # Save to temporary local path first
655
- # print("file name: ", filename)
656
- # print("length text content: ", len(text_content))
657
- # local_path = os.path.join(tempfile.gettempdir(), filename)
658
- # document = Document()
659
- # for paragraph_text in text_content.split('\n'):
660
- # document.add_paragraph(paragraph_text)
661
- # document.save(local_path)
662
- # print(f"✅ Text saved locally to: {local_path}")
663
-
664
- # # Upload to Drive
665
- # pipeline.upload_file_to_drive(local_path, filename, drive_folder_id)
666
- # print(f"✅ Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
667
-
668
- # except Exception as e:
669
- # print(f"❌ Error saving or uploading DOCX: {e}")
670
def save_text_to_docx(text_content: str, full_local_path: str):
    """Write ``text_content`` to a .docx file at ``full_local_path``.

    Each ``\\n``-separated line becomes its own paragraph (empty lines
    produce empty paragraphs).  A confirmation is printed after saving.
    """
    docx_file = Document()
    paragraphs = text_content.split('\n')
    for line in paragraphs:
        docx_file.add_paragraph(line)
    docx_file.save(full_local_path)
    print(f"✅ Saved DOCX locally: {full_local_path}")
676
-
677
-
678
-
679
- '''2 scenerios:
680
- - quick look then found then deepdive and directly get location then stop
681
- - quick look then found then deepdive but not find location then hold the related words then
682
- look another files iteratively for each related word and find location and stop'''
683
def extract_context(text, keyword, window=500):
    """Return a lower-cased window of ``text`` around ``keyword``.

    The keyword is searched case-insensitively.  When it is absent and the
    keyword looks like a code ending in digits (e.g. ``A1YU101``), the
    code's prefix (``a1yu``) is searched instead.

    Args:
        text: Text to search.
        keyword: Sample identifier.
        window: Number of characters kept on each side of the match start.

    Returns:
        ``text.lower()[idx - window : idx + window]`` for the match start
        ``idx`` (clamped at the beginning of the text), or the string
        ``"Sample ID not found."`` when neither the keyword nor its prefix
        occurs.
    """
    lowered = text.lower()
    idx = lowered.find(keyword.lower())

    if idx == -1:
        # Fall back to the non-numeric prefix of a code-like keyword.
        match = re.search(r'([A-Z0-9]+?)(\d+)$', keyword, re.IGNORECASE)
        if match:
            idx = lowered.find(match.group(1).lower())

    if idx == -1:
        # Never slice with idx == -1: that would return an unrelated tail.
        return "Sample ID not found."
    return lowered[max(0, idx - window): idx + window]
705
def process_inputToken(filePaths, saveLinkFolder, accession=None, isolate=None):
    """Search ``filePaths`` for the country of a sample.

    For every file and every keyword (``isolate`` first, then
    ``accession``):

    * Scenario 1 -- the keyword occurs in the file: ask
      ``model.get_country_from_text`` on a window around it, then on the
      whole file, then on the BFS/DFS sentence neighbourhoods.
    * Scenario 2 -- the keyword is absent: cache the file and, once the
      keyword has been seen in another file, search the accumulated text
      for a cross-reference.

    Args:
        filePaths: Iterable of document links/paths.
        saveLinkFolder: Folder identifier forwarded to
            ``preprocess_document``.
        accession: Optional accession number.
        isolate: Optional isolate name.

    Returns:
        ``(country, context, text)``; ``("unknown"`` or the last model
        answer, ``""``, accumulated text) when no country was found.
    """
    # NOTE(review): block nesting was reconstructed from a de-indented
    # listing -- confirm against the repository copy.

    def search_graph(bfs_text, dfs_text, keyword):
        # Run both neighbourhood searches (BFS first, as before).
        chunk_bfs = get_contextual_sentences_BFS(bfs_text, keyword)
        country_bfs = model.get_country_from_text(chunk_bfs)
        country_dfs, chunk_dfs = get_contextual_sentences_DFS(dfs_text, keyword)
        return (country_bfs, chunk_bfs), (country_dfs, chunk_dfs)

    def pick(bfs, dfs):
        # Prefer the shorter context when both searches found a country.
        bfs_found = bfs[0] != "unknown"
        dfs_found = dfs[0] != "unknown"
        if dfs_found and bfs_found:
            return dfs if len(dfs[1]) <= len(bfs[1]) else bfs
        if dfs_found:
            return dfs
        if bfs_found:
            return bfs
        return None

    def append_unseen(accumulated, piece):
        if piece.lower() not in accumulated.lower():
            return merge_texts_skipping_overlap(accumulated, piece) + "\n"
        return accumulated

    cache = {}
    country = "unknown"
    output = ""
    keyword_appear = (False, "")
    keywords = [k for k in (isolate, accession) if k]

    for f in filePaths:
        for keyword in keywords:
            text, tables, final_input = preprocess_document(f, saveLinkFolder, isolate=keyword)
            if keyword in final_input:
                # Scenario 1: direct location.
                context = extract_context(final_input, keyword)
                country = model.get_country_from_text(context)
                if country != "unknown":
                    return country, context, final_input
                country = model.get_country_from_text(final_input)
                if country != "unknown":
                    return country, context, final_input

                # Might be a cross-reference: remember where it appeared.
                keyword_appear = (True, f)
                cache[f] = context
                small_output = merge_texts_skipping_overlap(output, context) + "\n"
                bfs, dfs = search_graph(small_output, output, keyword)
                output = merge_texts_skipping_overlap(output, final_input)
                found = pick(bfs, dfs)
                if found:
                    return found[0], found[1], output
            else:
                # Scenario 2: keyword absent here; it may be resolved via a
                # code that links this file to the one where it appeared.
                cache[f] = final_input
                if keyword_appear[0]:
                    for cached_file, cached_text in cache.items():
                        if cached_file != keyword_appear[1]:
                            output = append_unseen(output, cached_text)
                    bfs, dfs = search_graph(output, output, keyword)
                    found = pick(bfs, dfs)
                    if found:
                        return found[0], found[1], output
                else:
                    output = append_unseen(output, cache[f])

    if len(output) == 0 or not keyword_appear[0]:
        for cached_text in cache.values():
            output = append_unseen(output, cached_text)
    return country, "", output
 
1
+ import re
2
+ import os
3
+ #import streamlit as st
4
+ import subprocess
5
+ import re
6
+ from Bio import Entrez
7
+ from docx import Document
8
+ import fitz
9
+ import spacy
10
+ from spacy.cli import download
11
+ from NER.PDF import pdf
12
+ from NER.WordDoc import wordDoc
13
+ from NER.html import extractHTML
14
+ from NER.word2Vec import word2vec
15
+ #from transformers import pipeline
16
+ import urllib.parse, requests
17
+ from pathlib import Path
18
+ import pandas as pd
19
+ import model
20
+ import pipeline
21
+ import tempfile
22
+ import nltk
23
+ nltk.download('punkt_tab')
24
def download_excel_file(url, save_path="temp.xlsx", timeout=30):
    """Download an Excel workbook to ``save_path`` and return the local path.

    Two kinds of URL trigger a download:

    * Office Online viewer links (``view.officeapps.live.com``): the real
      file URL is read from the ``src`` query parameter.
    * Direct ``http(s)`` links ending in ``.xls`` or ``.xlsx``.

    Anything else is returned unchanged (it is treated as a path that is
    already available).

    Args:
        url: Viewer link, direct link, or an already-local path.
        save_path: Destination for the downloaded bytes.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``save_path`` when a download happened, otherwise ``url``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        KeyError: A viewer link carries no ``src`` parameter.
    """
    if "view.officeapps.live.com" in url:
        query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        download_url = urllib.parse.unquote(query["src"][0])
    elif url.startswith("http") and url.endswith((".xls", ".xlsx")):
        download_url = url
    else:
        print("URL must point directly to an .xls or .xlsx file\n or it already downloaded.")
        return url

    response = requests.get(download_url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page as a workbook.
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)
    print(len(response.content))
    return save_path
42
+
43
+ from pathlib import Path
44
+ import pandas as pd
45
+
46
def process_file(link, saveFolder):
    """Classify ``link`` by file extension and prefer an existing local copy.

    Args:
        link: URL or path of the document.
        saveFolder: Local folder that may already hold a copy of the file.

    Returns:
        A tuple ``(ext, link, file_path)`` where

        * ``ext`` is the lower-cased suffix of the file name (``".pdf"``,
          ``".docx"``, ... or ``""`` when there is none),
        * ``link`` is the input link, replaced by the local path (as ``str``)
          when the file already exists inside ``saveFolder``,
        * ``file_path`` is the ``Path`` the file has, or would have, inside
          ``saveFolder``.
    """
    name = Path(link).name
    ext = Path(name).suffix.lower()
    file_path = Path(saveFolder) / name

    # If it's already in saveFolder, point the link at the local copy.
    if file_path.is_file():
        link = str(file_path)

    return ext, link, file_path
57
+
58
+ import asyncio
59
+ import aiohttp
60
+ _html_cache = {}
61
+
62
async def async_fetch_html(link: str, timeout: int = 15) -> str:
    """Fetch the body of ``link`` asynchronously, memoising successful results.

    Successful responses are stored in the module-level ``_html_cache`` so a
    link is downloaded at most once per process. Failures are not cached.

    Args:
        link: URL to fetch.
        timeout: Total number of seconds allowed for the request.

    Returns:
        The response text, or ``""`` when the status is not 200 or any error
        occurs (the error is printed, never raised).
    """
    if link in _html_cache:
        return _html_cache[link]

    try:
        # aiohttp expects a ClientTimeout object; bare numbers are only kept
        # for backward compatibility.
        client_timeout = aiohttp.ClientTimeout(total=timeout)
        async with aiohttp.ClientSession() as session:
            async with session.get(link, timeout=client_timeout) as resp:
                if resp.status != 200:
                    print(f"⚠️ Failed {link} ({resp.status})")
                    return ""
                html_content = await resp.text()
                _html_cache[link] = html_content
                return html_content
    except Exception as e:
        print(f" async_fetch_html error for {link}: {e}")
        return ""
79
+
80
async def ensure_local_file(link: str, saveFolder: str) -> str:
    """Make sure the file behind ``link`` exists in the temp directory.

    Lookup order:

    1. An existing file with the same name in ``tempfile.gettempdir()``.
    2. A copy in the Drive folder ``saveFolder`` (blocking ``pipeline`` calls
       are run in a worker thread).
    3. A fresh download of ``link``, which is then uploaded to the Drive
       folder so later runs can reuse it.

    Args:
        link: URL of the file; the last ``/``-separated segment is the name.
        saveFolder: Drive folder identifier passed through to ``pipeline``.

    Returns:
        Path of the local temporary copy.

    Raises:
        aiohttp.ClientError: The web download failed or returned an error
            status.
    """
    name = link.split("/")[-1]
    local_temp_path = os.path.join(tempfile.gettempdir(), name)

    # isfile, not exists: a link ending in "/" yields an empty name, and the
    # temp directory itself must not be mistaken for a cached download.
    if os.path.isfile(local_temp_path):
        return local_temp_path

    # Try Drive first (blocking offload)
    file_id = await asyncio.to_thread(pipeline.find_drive_file, name, saveFolder)
    if file_id:
        await asyncio.to_thread(pipeline.download_file_from_drive, name, saveFolder, local_temp_path)
    else:
        # Web download asynchronously
        request_timeout = aiohttp.ClientTimeout(total=20)
        async with aiohttp.ClientSession() as session:
            async with session.get(link, timeout=request_timeout) as resp:
                resp.raise_for_status()
                content = await resp.read()
                with open(local_temp_path, "wb") as f:
                    f.write(content)
        # Upload back to Drive (offload)
        await asyncio.to_thread(pipeline.upload_file_to_drive, local_temp_path, name, saveFolder)

    return local_temp_path
104
+
105
async def async_extract_text(link, saveFolder):
    """Asynchronously extract the text of the document behind ``link``.

    Dispatch is by link suffix:

    * ``.pdf``           -> ``pdf.PDFFast(...).extract_text()``
    * ``.doc``/``.docx`` -> ``wordDoc.WordDocFast(...).extractText()``
    * ``.xls``/``.xlsx`` -> ``""`` (spreadsheets carry no running text)
    * ``http...`` links or links containing ``html`` -> HTML section
      extraction, falling back to Crossref metadata when the page yields
      no text.

    Blocking extractors run in worker threads so the event loop stays free.

    Args:
        link: URL of the document.
        saveFolder: Folder identifier forwarded to the extractors.

    Returns:
        The extracted text, or ``""`` for unsupported links and on any error
        (the error is printed, never raised).
    """
    try:
        if link.endswith(".pdf"):
            local_path = await ensure_local_file(link, saveFolder)
            return await asyncio.to_thread(lambda: pdf.PDFFast(local_path, saveFolder).extract_text())

        if link.endswith((".doc", ".docx")):
            local_path = await ensure_local_file(link, saveFolder)
            return await asyncio.to_thread(lambda: wordDoc.WordDocFast(local_path, saveFolder).extractText())

        if link.endswith((".xls", ".xlsx")):
            return ""

        if link.startswith("http") or "html" in link:
            html_content = await async_fetch_html(link)
            html = extractHTML.HTML(htmlContent=html_content, htmlLink=link, htmlFile="")
            # Prefer a native coroutine when the HTML class offers one.
            if hasattr(html, "async_getListSection"):
                article_text = await html.async_getListSection()
            else:
                # fallback: run sync getListSection in a thread
                article_text = await asyncio.to_thread(html.getListSection)

            if not article_text:
                # fetch_crossref_metadata is a synchronous call (presumably a
                # network lookup -- TODO confirm); offload it like
                # getListSection so it cannot stall the event loop.
                metadata_text = await asyncio.to_thread(html.fetch_crossref_metadata, link)
                if metadata_text:
                    article_text = html.mergeTextInJson(metadata_text)
            return article_text

        return ""
    except Exception as e:
        print(f"❌ async_extract_text failed for {link}: {e}")
        return ""
139
+
140
+
141
def extract_text(link,saveFolder):
    """Extract the plain text of the document behind ``link``.

    The file is first made available in the temp directory (reusing an
    existing temp copy, else a Drive copy, else a web download that is then
    uploaded to Drive). PDF and Word links are parsed from that local copy;
    any other non-Excel link containing ``http`` or ``html`` is parsed as an
    HTML page. The temp copy is removed afterwards, whether or not extraction
    succeeded.

    Args:
        link: URL or file name of the document.
        saveFolder: Drive folder identifier forwarded to ``pipeline`` and to
            the extractor classes.

    Returns:
        The extracted text, or ``""`` when nothing could be extracted or an
        error occurred (the error is printed, never raised).
    """
    text = ""
    name = ""
    local_temp_path = None
    try:
        name = link.split("/")[-1]
        print("name: ", name)
        local_temp_path = os.path.join(tempfile.gettempdir(), name)
        print("this is local temp path: ", local_temp_path)
        if os.path.exists(local_temp_path):
            input_to_class = local_temp_path
            print("exist")
        else:
            # 1. Check if file exists in shared Google Drive folder
            file_id = pipeline.find_drive_file(name, saveFolder)
            if file_id:
                print("📥 Downloading from Google Drive...")
                pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
            else:
                print("🌐 Downloading from web link...")
                response = requests.get(link)
                with open(local_temp_path, 'wb') as f:
                    f.write(response.content)
                print("✅ Saved locally.")

                # 2. Upload to Drive so it's available for later
                pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)

            input_to_class = local_temp_path
        print(input_to_class)

        if link.endswith(".pdf"):
            print("inside pdf and input to class: ", input_to_class)
            print("save folder in extract text: ", saveFolder)
            p = pdf.PDFFast(input_to_class, saveFolder)
            text = p.extract_text()

            print("len text from pdf:")
            print(len(text))
        elif link.endswith((".doc", ".docx")):
            d = wordDoc.WordDocFast(input_to_class, saveFolder)
            text = d.extractText()
        # Compare against the real extensions: the former `not in "xlsx"` was
        # a substring test, so suffixes such as "s", "x" or "" were mistaken
        # for spreadsheets and their HTML was never read.
        elif link.split(".")[-1].lower() not in ("xls", "xlsx"):
            if "http" in link or "html" in link:
                print("html link: ", link)
                html = extractHTML.HTML("",link)
                text = html.getListSection() # the text already clean
                print("len text html: ")
                print(len(text))
        print("done extract text")
    except Exception as e:
        print(f"❌ extract_text failed for {link}: {e}")
        text = ""
    finally:
        # Cleanup runs on failure too, so a broken download is never reused
        # by the next call; a cleanup problem must not discard the text.
        if name and local_temp_path and os.path.isfile(local_temp_path):
            try:
                os.remove(local_temp_path)
                print(f"🧹 Deleted local temp file: {local_temp_path}")
            except OSError as e:
                print(f"⚠️ Could not delete local temp file {local_temp_path}: {e}")
    return text
216
+
217
def extract_table(link,saveFolder):
    """Extract the tables of the document behind ``link``.

    The file is first made available in the temp directory (reusing an
    existing temp copy, else a Drive copy, else a web download that is then
    uploaded to Drive). Dispatch is by link suffix: PDF, Word, Excel
    (``.xls``/``.xlsx``, one table per sheet) and finally HTML. The result is
    passed through ``clean_tables_format`` and the temp copy is removed,
    whether or not extraction succeeded.

    Args:
        link: URL or file name of the document.
        saveFolder: Drive folder identifier forwarded to ``pipeline`` and to
            the extractor classes.

    Returns:
        A list of tables as produced by ``clean_tables_format``, or ``[]``
        when nothing was found or an error occurred (the error is printed,
        never raised).
    """
    table = []
    local_temp_path = None
    try:
        name = link.split("/")[-1]
        local_temp_path = os.path.join(tempfile.gettempdir(), name)
        if os.path.exists(local_temp_path):
            input_to_class = local_temp_path
            print("exist")
        else:
            # 1. Check if file exists in shared Google Drive folder
            file_id = pipeline.find_drive_file(name, saveFolder)
            if file_id:
                print("📥 Downloading from Google Drive...")
                pipeline.download_file_from_drive(name, saveFolder, local_temp_path)
            else:
                print("🌐 Downloading from web link...")
                response = requests.get(link)
                with open(local_temp_path, 'wb') as f:
                    f.write(response.content)
                print("✅ Saved locally.")

                # 2. Upload to Drive so it's available for later
                pipeline.upload_file_to_drive(local_temp_path, name, saveFolder)

            input_to_class = local_temp_path
        print(input_to_class)

        if link.endswith(".pdf"):
            p = pdf.PDF(input_to_class,saveFolder)
            table = p.extractTable()
        elif link.endswith((".doc", ".docx")):
            d = wordDoc.WordDocFast(input_to_class, saveFolder)
            table = d.extractTableAsList()
        # Compare against the real extensions: `in "xlsx"` was a substring
        # test and also matched suffixes such as "s", "x" or "".
        elif link.split(".")[-1].lower() in ("xls", "xlsx"):
            # Keep the copy under saveFolder that earlier versions produced.
            # The workbook is parsed from the temp copy below, so a failure
            # here must not abort the extraction.
            # NOTE(review): confirm this second copy is still needed.
            savePath = saveFolder +"/"+ link.split("/")[-1]
            try:
                download_excel_file(link, savePath)
            except Exception as e:
                print("⚠️ Could not store Excel copy in save folder:", e)
            try:
                # Context manager closes the workbook before the temp file
                # is deleted in the cleanup step.
                with pd.ExcelFile(local_temp_path) as xls:
                    table_list = []
                    for sheet_name in xls.sheet_names:
                        df = pd.read_excel(xls, sheet_name=sheet_name)
                        cleaned_table = df.fillna("").astype(str).values.tolist()
                        table_list.append(cleaned_table)
                table = table_list
            except Exception as e:
                print("❌ Failed to extract tables from Excel:", e)
        elif "http" in link or "html" in link:
            html = extractHTML.HTML("",link)
            table = html.extractTable() # table is a list
        table = clean_tables_format(table)
    except Exception as e:
        print(f"❌ extract_table failed for {link}: {e}")
        table = []
    finally:
        # Cleanup runs on failure too, so a broken download is never reused.
        if local_temp_path and os.path.isfile(local_temp_path):
            try:
                os.remove(local_temp_path)
                print(f"🧹 Deleted local temp file: {local_temp_path}")
            except OSError as e:
                print(f"⚠️ Could not delete local temp file {local_temp_path}: {e}")
    return table
289
+
290
def clean_tables_format(tables):
    """Normalise extracted tables to ``List[List[List[str]]]``.

    Each table may be a pandas DataFrame or a list of row lists. Cells are
    converted to stripped strings; empty cells, empty rows and empty tables
    are dropped. Entries that are neither a DataFrame nor a list of lists
    are skipped.

    Args:
        tables: Iterable of tables, or a falsy value.

    Returns:
        The cleaned tables; ``[]`` when ``tables`` is falsy.
    """
    if not tables:
        return []

    result = []
    for raw in tables:
        # DataFrames are first turned into a list of string rows.
        if isinstance(raw, pd.DataFrame):
            raw = raw.fillna("").astype(str).values.tolist()

        # Anything that is not a list of row lists is ignored.
        if not isinstance(raw, list):
            continue
        if any(not isinstance(row, list) for row in raw):
            continue

        rows = []
        for row in raw:
            cells = [value for value in (str(cell).strip() for cell in row) if value]
            if cells:
                rows.append(cells)

        if rows:
            result.append(rows)

    return result
318
+
319
+ import json
320
def normalize_text_for_comparison(s: str) -> str:
    """Return ``s`` in a canonical form suited to substring comparison.

    The text is lower-cased, every run of whitespace (spaces, tabs and all
    newline styles) is collapsed to a single space, and leading/trailing
    whitespace is removed. Paragraph breaks are therefore not preserved.
    """
    lowered = s.lower()
    # Newline styles are unified first, then every whitespace run is folded.
    unified = lowered.replace('\r\n', '\n').replace('\r', '\n')
    collapsed = re.sub(r'\s+', ' ', unified)
    return collapsed.strip()
338
def merge_text_and_tables(text, tables, max_tokens=12000, keep_tables=True, tokenizer="cl100k_base", accession_id=None, isolate=None):
    """Merge document text and relevant table rows into one LLM input string.

    A table is considered relevant when its lower-cased content mentions a
    location keyword, ``accession_id`` or ``isolate``. From relevant tables,
    every row whose content is not already present in ``text`` (compared
    after ``normalize_text_for_comparison``) is appended as a JSON block
    under a ``## Table <n>`` heading.

    Args:
        text: Cleaned document text.
        tables: List of tables, each a list of rows, each a list of cells.
        max_tokens: Unused; kept for backward compatibility.
        keep_tables: Unused; kept for backward compatibility.
        tokenizer: Unused; kept for backward compatibility.
        accession_id: Optional accession number to look for in tables.
        isolate: Optional isolate name to look for in tables.

    Returns:
        The merged string, stripped of surrounding whitespace.
    """
    import json  # function-scope import keeps this block self-contained

    # Built once; the list does not depend on the individual table.
    keywords = ["province", "district", "region", "village", "location",
                "country", "origin", "ancient", "modern"]
    if accession_id:
        keywords.append(accession_id.lower())
    if isolate:
        keywords.append(isolate.lower())

    def row_text(row, sep):
        # str() guards against numeric/None cells, which would crash join().
        return sep.join(str(cell) for cell in row) if row else ""

    def is_table_relevant(table):
        flat = " ".join(row_text(row, " ").lower() for row in table)
        return any(kw in flat for kw in keywords)

    llm_input = "## Document Text\n" + text.strip() + "\n"
    clean_text = normalize_text_for_comparison(text)

    for idx, table in enumerate(tables or []):
        if not is_table_relevant(table):
            continue
        for tab in table:
            # A row counts as duplicated when it appears in the text either
            # space-joined or newline-joined.
            clean_preview = normalize_text_for_comparison(row_text(tab, " "))
            clean_preview1 = normalize_text_for_comparison(row_text(tab, "\n"))
            if clean_preview not in clean_text and clean_preview1 not in clean_text:
                table_str = json.dumps([tab], indent=2)
                llm_input += f"## Table {idx+1}\n{table_str}\n"
    return llm_input.strip()
381
+
382
def _tables_to_text(tables):
    """Flatten nested table data into one comma-separated string of rows."""
    lines = []
    for table in tables or []:
        if isinstance(table, str):
            rows = [table]
        else:
            rows = table
        for row in rows:
            if isinstance(row, (list, tuple)):
                cells = (str(cell).strip() for cell in row)
                line = " ".join(cell for cell in cells if cell)
            else:
                line = str(row).strip()
            if line:
                lines.append(line)
    return ", ".join(lines)


def preprocess_document(link, saveFolder, accession=None, isolate=None, article_text=None):
    """Extract the text and tables of a document and merge them into one string.

    Args:
        link: URL or file name of the document.
        saveFolder: Folder identifier forwarded to the extractors.
        accession: Unused; kept for backward compatibility.
        isolate: Unused; kept for backward compatibility.
        article_text: Already-extracted text; when given, text extraction is
            skipped.

    Returns:
        A tuple ``(text, tables, final_input)``. ``tables`` is ``[]`` when
        table extraction fails or exceeds its 10 second budget.
        ``final_input`` is ``text`` followed by the flattened tables, and is
        exactly ``text`` when there are no tables.
    """
    if article_text:
        print("article text already available")
        text = article_text
    else:
        try:
            print("start preprocess and extract text")
            text = extract_text(link, saveFolder)
        except Exception:
            text = ""
    try:
        print("extract table start")
        success, the_output = pipeline.run_with_timeout(extract_table,args=(link,saveFolder),timeout=10)
        print("Returned from timeout logic")
        if success:
            tables = the_output
            print("yes succeed for extract table")
        else:
            print("not suceed etxract table")
            tables = []
    except Exception:
        tables = []
    try:
        print("just merge text and tables")
        # Tables are nested lists, so joining them directly raised TypeError
        # and the whole merged input was lost; flatten them to text first.
        tables_text = _tables_to_text(tables)
        if tables_text and text and not text[-1].isspace():
            final_input = text + "\n" + tables_text
        else:
            final_input = text + tables_text
    except Exception:
        print("no succeed here in preprocess docu")
        final_input = ""
    return text, tables, final_input
421
+
422
def extract_sentences(text):
    """Split ``text`` into sentences after ``.``, ``!`` or ``?`` plus whitespace.

    Returns the stripped, non-empty pieces in their original order.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    result = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            result.append(trimmed)
    return result
425
+
426
def is_irrelevant_number_sequence(text):
    """Tell whether ``text`` is mostly bare numbers (e.g. a table data row).

    Text is kept (``False``) as soon as it contains something that looks like
    a sample code (two or more letters followed by digits, e.g. ``KM1``) or a
    word followed by a number (e.g. ``Table 3``). Otherwise it is irrelevant
    (``True``) when fewer than 20% of its whitespace-separated tokens are
    words and more than 50% are numbers.

    A former extra check, ``re.fullmatch(r'(\\d+(\\.\\d+)?\\s*)+', ...)``, was
    removed: any text it accepts consists solely of numeric tokens and is
    already caught by the ratio test, while its nested quantifiers backtrack
    exponentially on a long digit run followed by other characters.
    """
    if re.search(r'\b[A-Z]{2,}\d+\b|\b[A-Za-z]+\s+\d+\b', text, re.IGNORECASE):
        return False
    total_tokens = len(re.findall(r'\S+', text))
    if total_tokens == 0:
        return False
    word_count = len(re.findall(r'\b[A-Za-z]{2,}\b', text))
    number_count = len(re.findall(r'\b\d[\d\.]*\b', text))
    return (word_count / total_tokens < 0.2) and (number_count / total_tokens > 0.5)
437
+
438
def remove_isolated_single_digits(sentence):
    """Drop stand-alone ``0`` and ``1`` tokens from ``sentence``.

    The sentence is split on whitespace; tokens equal to ``'0'`` or ``'1'``
    are removed and the rest is re-joined with single spaces.
    """
    kept = [token for token in sentence.split() if token not in ('0', '1')]
    return ' '.join(kept).strip()
447
+
448
def get_contextual_sentences_BFS(text_content, keyword, depth=2):
    """Collect sentences related to ``keyword`` by breadth-first code linking.

    Seed sentences are those that mention ``keyword`` as a whole word, or
    that contain a code range ``<prefix><a>-<prefix><b>`` whose prefix matches
    the keyword's and whose numeric span includes the keyword's number.
    Every code (two or more letters followed by digits) in a seed sentence
    becomes a start node. Codes sharing a sentence are neighbours; the
    sentences of every code reached at a level below ``depth`` are added.

    Args:
        text_content: Text to search.
        keyword: Sample identifier such as ``A1YU101`` or ``KM1``.
        depth: Number of levels expanded from the seed codes.

    Returns:
        The selected sentences, filtered through
        ``is_irrelevant_number_sequence`` and
        ``remove_isolated_single_digits``, sorted and joined by newlines.
    """
    from collections import deque  # function-scope import; FIFO with O(1) pops

    # Patterns are compiled once instead of once per sentence.
    code_re = re.compile(r'\b[A-Z]{2,}[0-9]+\b', re.IGNORECASE)
    split_re = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
    range_re = re.compile(r'([A-Z0-9]+\d+)-([A-Z0-9]+\d+)', re.IGNORECASE)
    keyword_re = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)

    sentences = extract_sentences(text_content)
    relevant_sentences = set()
    initial_keywords = set()

    # Split the keyword into its prefix and trailing number, if it has one.
    keyword_prefix = None
    keyword_num = None
    keyword_match = split_re.search(keyword)
    if keyword_match:
        keyword_prefix = keyword_match.group(1).lower()
        keyword_num = int(keyword_match.group(2))

    word_to_sentences = {}  # code -> sentences mentioning it
    graph = {}              # code -> codes sharing a sentence with it

    for sentence in sentences:
        stripped = sentence.strip()
        codes = {code.lower() for code in code_re.findall(sentence)}
        seeded = False

        # 1. Exact whole-word mention of the keyword.
        if keyword_re.search(sentence):
            relevant_sentences.add(stripped)
            initial_keywords.add(keyword.lower())
            seeded = True

        # 2. Code ranges (e.g. A1YU101-A1YU137) that enclose the keyword.
        if keyword_match:
            for r_match in range_re.finditer(sentence):
                start_code_str = r_match.group(1)
                end_code_str = r_match.group(2)
                start_match = split_re.search(start_code_str)
                end_match = split_re.search(end_code_str)
                if not (start_match and end_match):
                    continue
                same_prefix = (keyword_prefix == start_match.group(1).lower()
                               and keyword_prefix == end_match.group(1).lower())
                if same_prefix and int(start_match.group(2)) <= keyword_num <= int(end_match.group(2)):
                    relevant_sentences.add(stripped)
                    initial_keywords.add(start_code_str.lower())
                    initial_keywords.add(end_code_str.lower())
                    seeded = True
                    break  # one matching range per sentence is enough

        # 3. Codes of a seed sentence become start nodes of the traversal.
        if seeded:
            initial_keywords.update(codes)

        for code in codes:
            word_to_sentences.setdefault(code, set()).add(stripped)
            graph.setdefault(code, set()).update(codes - {code})

    # Breadth-first expansion from all seed codes at once.
    queue = deque((k, 0) for k in initial_keywords if k in word_to_sentences)
    visited_words = set(initial_keywords)

    while queue:
        current_word, level = queue.popleft()
        if level >= depth:
            continue

        relevant_sentences.update(word_to_sentences.get(current_word, ()))

        for neighbor in graph.get(current_word, ()):
            if neighbor not in visited_words:
                visited_words.add(neighbor)
                queue.append((neighbor, level + 1))

    final_sentences = set()
    for sentence in relevant_sentences:
        if not is_irrelevant_number_sequence(sentence):
            processed_sentence = remove_isolated_single_digits(sentence)
            if processed_sentence:
                final_sentences.add(processed_sentence)

    return "\n".join(sorted(final_sentences))
560
+
561
+
562
+
563
def get_contextual_sentences_DFS(text_content, keyword, depth=2):
    """Search depth-first from ``keyword`` for a sentence that names a country.

    Starting at ``keyword``, each sentence containing the current word is
    checked with ``model.get_country_from_text`` -- first on a short window
    around the word, then on the whole sentence. The search stops at the
    first result other than ``"unknown"``. Otherwise it continues through
    the other codes (two or more letters followed by digits) of that
    sentence, nearest code first, down to ``depth`` levels.

    Sentences are visited in document order, so repeated runs on the same
    text give the same answer.

    Args:
        text_content: Text to search.
        keyword: Sample identifier to start from.
        depth: Maximum recursion depth.

    Returns:
        A tuple ``(country, chunk)``. ``chunk`` holds the visited sentences,
        filtered, sorted and newline-joined; when none survive the filter,
        the whole ``text_content`` is returned instead.
    """
    sentences = extract_sentences(text_content)
    word_re = re.compile(r'\b[A-Za-z0-9\-_\/]+\b')
    code_re = re.compile(r'\b[A-Z]{2,}[0-9]+\b', re.IGNORECASE)

    # word -> sentences containing it. A dict is used as an insertion-ordered
    # set so the traversal follows document order instead of set order.
    word_to_sentences = {}
    for sent in sentences:
        stripped = sent.strip()
        for word in set(word_re.findall(sent)):
            word_to_sentences.setdefault(word.lower(), {})[stripped] = None

    def extract_codes(sentence):
        # Only codes like 'KSK1', 'MG272794', not pure numbers
        return code_re.findall(sentence)

    def dfs_traverse(current_word, current_depth, max_depth, visited_words, collected_sentences, parent_sentence=None):
        country = "unknown"
        if current_depth > max_depth:
            return country, False

        if current_word not in word_to_sentences:
            return country, False

        for sentence in word_to_sentences[current_word]:
            if sentence == parent_sentence:
                continue  # avoid reusing the same sentence

            collected_sentences.add(sentence)

            # Narrow window first, then the whole sentence.
            small_sen = extract_context(sentence, current_word, int(len(sentence) / 4))
            country = model.get_country_from_text(small_sen)
            if country.lower() != "unknown":
                return country, True
            country = model.get_country_from_text(sentence)
            if country.lower() != "unknown":
                return country, True

            codes_in_sentence = extract_codes(sentence)
            idx = next((i for i, code in enumerate(codes_in_sentence) if code.lower() == current_word.lower()), None)
            if idx is None:
                continue

            # Nearest codes first; on equal distance the following one wins.
            sorted_children = sorted(
                [code for code in codes_in_sentence if code.lower() not in visited_words],
                key=lambda x: (abs(codes_in_sentence.index(x) - idx),
                               0 if codes_in_sentence.index(x) > idx else 1)
            )

            for child in sorted_children:
                child_lower = child.lower()
                if child_lower not in visited_words:
                    visited_words.add(child_lower)
                    country, should_stop = dfs_traverse(
                        child_lower, current_depth + 1, max_depth,
                        visited_words, collected_sentences, parent_sentence=sentence
                    )
                    if should_stop:
                        return country, True

        return country, False

    # Begin DFS
    collected_sentences = set()
    visited_words = {keyword.lower()}
    country, status = dfs_traverse(keyword.lower(), 0, depth, visited_words, collected_sentences)

    # Filter irrelevant sentences
    final_sentences = set()
    for sentence in collected_sentences:
        if not is_irrelevant_number_sequence(sentence):
            processed = remove_isolated_single_digits(sentence)
            if processed:
                final_sentences.add(processed)
    if not final_sentences:
        return country, text_content
    return country, "\n".join(sorted(final_sentences))
646
+
647
+ # Helper function for normalizing text for overlap comparison
648
def normalize_for_overlap(s: str) -> str:
    """Lower-case ``s``, replace punctuation by spaces and collapse whitespace."""
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s).lower()
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def merge_texts_skipping_overlap(text1: str, text2: str) -> str:
    """Append ``text2`` to ``text1`` without repeating shared content.

    Strategies, in order:

    1. If either text is empty or contained in the other, the longer one is
       returned unchanged.
    2. Junction overlap: the longest suffix of ``text1`` that equals a prefix
       of ``text2`` -- exactly, or after ``normalize_for_overlap`` -- is
       emitted only once.
    3. Common prefix: when both texts start identically, the shared start of
       ``text2`` is dropped. The shared part is cut back so it never ends in
       the middle of a word or identifier (``KM1`` and ``KM2`` share nothing).
    4. Otherwise the texts are concatenated.

    Except for case 1, runs of whitespace in the result collapse to one space.

    Fixes over the earlier behaviour:

    * A punctuation/whitespace-only suffix and prefix both normalise to
      ``""``; that no longer counts as overlap, which used to delete the
      first characters of ``text2``.
    * A shared prefix of a single letter no longer chops ``text2`` mid-word.
    * Texts joined without an exact junction are separated by a space, so the
      last word of ``text1`` and the first kept word of ``text2`` are not
      fused.

    NOTE: the junction search normalises a suffix/prefix pair for every
    candidate length, so it is quadratic in the shorter text.
    """
    if not text1:
        return text2
    if not text2:
        return text1

    # Case 1: one text fully contains the other.
    if text2 in text1:
        return text1
    if text1 in text2:
        return text2

    def collapse(merged):
        return re.sub(r'\s+', ' ', merged).strip()

    # Case 2: suffix of text1 == prefix of text2, longest candidate first.
    overlap = 0
    exact = False
    for size in range(min(len(text1), len(text2)), 0, -1):
        tail = text1[-size:]
        head = text2[:size]
        if tail == head:
            overlap, exact = size, True
            break
        tail_norm = normalize_for_overlap(tail)
        # An empty normalised form means "only punctuation/whitespace" and
        # carries no evidence of shared content.
        if tail_norm and tail_norm == normalize_for_overlap(head):
            overlap = size
            break

    if overlap:
        # A normalised match may have swallowed the separating whitespace.
        joiner = "" if exact else " "
        return collapse(text1 + joiner + text2[overlap:])

    # Case 3: shared beginning. Neither text contains the other here, so the
    # common prefix is strictly shorter than both and the indexing is safe.
    shared = len(os.path.commonprefix([text1, text2]))
    while shared > 0 and text1[shared - 1].isalnum() and (
            text1[shared].isalnum() or text2[shared].isalnum()):
        shared -= 1  # back off until the cut is not inside a word

    if shared > 0 and text1[shared:].strip() and text2[shared:].strip():
        return collapse(text1 + " " + text2[shared:])

    # Case 4: nothing in common.
    return collapse(text1 + " " + text2)
720
+
721
+ from docx import Document
722
+ from pipeline import upload_file_to_drive
723
+ # def save_text_to_docx(text_content: str, file_path: str):
724
+ # """
725
+ # Saves a given text string into a .docx file.
726
+
727
+ # Args:
728
+ # text_content (str): The text string to save.
729
+ # file_path (str): The full path including the filename where the .docx file will be saved.
730
+ # Example: '/content/drive/MyDrive/CollectData/Examples/test/SEA_1234/merged_document.docx'
731
+ # """
732
+ # try:
733
+ # document = Document()
734
+
735
+ # # Add the entire text as a single paragraph, or split by newlines for multiple paragraphs
736
+ # for paragraph_text in text_content.split('\n'):
737
+ # document.add_paragraph(paragraph_text)
738
+
739
+ # document.save(file_path)
740
+ # print(f"Text successfully saved to '{file_path}'")
741
+ # except Exception as e:
742
+ # print(f"Error saving text to docx file: {e}")
743
+ # def save_text_to_docx(text_content: str, filename: str, drive_folder_id: str):
744
+ # """
745
+ # Saves a given text string into a .docx file locally, then uploads to Google Drive.
746
+
747
+ # Args:
748
+ # text_content (str): The text string to save.
749
+ # filename (str): The target .docx file name, e.g. 'BRU18_merged_document.docx'.
750
+ # drive_folder_id (str): Google Drive folder ID where to upload the file.
751
+ # """
752
+ # try:
753
+ # # Save to temporary local path first
754
+ # print("file name: ", filename)
755
+ # print("length text content: ", len(text_content))
756
+ # local_path = os.path.join(tempfile.gettempdir(), filename)
757
+ # document = Document()
758
+ # for paragraph_text in text_content.split('\n'):
759
+ # document.add_paragraph(paragraph_text)
760
+ # document.save(local_path)
761
+ # print(f"✅ Text saved locally to: {local_path}")
762
+
763
+ # # Upload to Drive
764
+ # pipeline.upload_file_to_drive(local_path, filename, drive_folder_id)
765
+ # print(f"✅ Uploaded '{filename}' to Google Drive folder ID: {drive_folder_id}")
766
+
767
+ # except Exception as e:
768
+ # print(f"❌ Error saving or uploading DOCX: {e}")
769
def save_text_to_docx(text_content: str, full_local_path: str):
    """Write ``text_content`` to a .docx file, one paragraph per input line.

    Args:
        text_content: Text to store; it is split on ``\\n``.
        full_local_path: Destination path of the .docx file.
    """
    doc = Document()
    paragraphs = text_content.split('\n')
    for line in paragraphs:
        doc.add_paragraph(line)
    doc.save(full_local_path)
    print(f"✅ Saved DOCX locally: {full_local_path}")
775
+
776
+
777
+
778
+ '''2 scenerios:
779
+ - quick look then found then deepdive and directly get location then stop
780
+ - quick look then found then deepdive but not find location then hold the related words then
781
+ look another files iteratively for each related word and find location and stop'''
782
def extract_context(text, keyword, window=500):
    """Return the lower-cased text surrounding the first mention of ``keyword``.

    The search is case-insensitive. When the full keyword is absent and it
    ends in digits (e.g. ``A1YU101``), the part before the trailing number
    (``a1yu``) is searched instead.

    Args:
        text: Text to search.
        keyword: Sample identifier to locate.
        window: Number of characters kept before and after the match start.

    Returns:
        ``text.lower()[idx - window: idx + window]`` (clamped at the start of
        the text), or ``"Sample ID not found."`` when neither the keyword nor
        its prefix occurs. Earlier, a missing keyword without trailing digits
        fell through with index -1 and returned the start of the text as if
        it were a match.
    """
    lowered = text.lower()
    idx = lowered.find(keyword.lower())

    if idx == -1:
        # Fall back to the identifier's prefix (everything before the
        # trailing number).
        keyword_match = re.search(r'([A-Z0-9]+?)(\d+)$', keyword, re.IGNORECASE)
        if keyword_match:
            idx = lowered.find(keyword_match.group(1).lower())

    if idx == -1:
        return "Sample ID not found."
    return lowered[max(0, idx - window): idx + window]
804
def _pick_country_chunk(countryDFS, chunkDFS, countryBFS, chunkBFS):
    """Choose between the DFS and BFS findings; ``None`` if both are unknown.

    When both searches name a country, the one backed by the shorter chunk
    wins (DFS on a tie).
    """
    if countryDFS != "unknown" and countryBFS != "unknown":
        if len(chunkDFS) <= len(chunkBFS):
            return countryDFS, chunkDFS
        return countryBFS, chunkBFS
    if countryDFS != "unknown":
        return countryDFS, chunkDFS
    if countryBFS != "unknown":
        return countryBFS, chunkBFS
    return None


def process_inputToken(filePaths, saveLinkFolder,accession=None, isolate=None):
    """Look through ``filePaths`` for the country linked to a sample.

    Two scenarios are handled per file and keyword (isolate first, then
    accession):

    1. The keyword occurs in the file: the country is looked up in the text
       around the keyword, then in the whole document, then through the
       BFS/DFS sentence searches.
    2. The keyword does not occur: the document is cached. Once the keyword
       has been seen in another file, the cached documents are merged and
       searched with BFS/DFS to follow cross-references.

    Each file is preprocessed once; the result does not depend on the
    keyword being checked.

    Args:
        filePaths: Links or paths of the documents to inspect.
        saveLinkFolder: Folder identifier forwarded to ``preprocess_document``.
        accession: Optional accession number.
        isolate: Optional isolate name.

    Returns:
        A tuple ``(country, chunk, output)``: the country (``"unknown"`` when
        none was found), the text chunk supporting it (``""`` when none) and
        the document text relevant to the decision.
    """
    cache = {}
    country = "unknown"
    output = ""
    keyword_appear = (False, "")
    keywords = [k for k in (isolate, accession) if k]

    for f in filePaths:
        if not keywords:
            continue
        text, tables, final_input = preprocess_document(
            f, saveLinkFolder, accession=accession, isolate=isolate)
        for keyword in keywords:
            if keyword in final_input:
                # Scenario 1: direct location near the keyword.
                context = extract_context(final_input, keyword)
                country = model.get_country_from_text(context)
                if country != "unknown":
                    return country, context, final_input
                country = model.get_country_from_text(final_input)
                if country != "unknown":
                    return country, context, final_input

                # Keyword present but no country: might be a cross-reference.
                keyword_appear = (True, f)
                cache[f] = context
                small_output = merge_texts_skipping_overlap(output, context) + "\n"
                chunkBFS = get_contextual_sentences_BFS(small_output, keyword)
                countryBFS = model.get_country_from_text(chunkBFS)
                countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
                output = merge_texts_skipping_overlap(output, final_input)
                picked = _pick_country_chunk(countryDFS, chunkDFS, countryBFS, chunkBFS)
                if picked:
                    return picked[0], picked[1], output
            else:
                # Scenario 2: e.g. A1YU101 is the keyword in file 2, which
                # refers to KM1, while KM1 is located in file 1.
                cache[f] = final_input
                if keyword_appear[0]:
                    for c in cache:
                        if c != keyword_appear[1]:
                            if cache[c].lower() not in output.lower():
                                output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
                    chunkBFS = get_contextual_sentences_BFS(output, keyword)
                    countryBFS = model.get_country_from_text(chunkBFS)
                    countryDFS, chunkDFS = get_contextual_sentences_DFS(output, keyword)
                    picked = _pick_country_chunk(countryDFS, chunkDFS, countryBFS, chunkBFS)
                    if picked:
                        return picked[0], picked[1], output
                else:
                    if cache[f].lower() not in output.lower():
                        output = merge_texts_skipping_overlap(output, cache[f]) + "\n"

    # Nothing conclusive: hand back everything that was collected.
    if len(output) == 0 or not keyword_appear[0]:
        for c in cache:
            if cache[c].lower() not in output.lower():
                output = merge_texts_skipping_overlap(output, cache[c]) + "\n"
    return country, "", output