Spaces:
Running
Running
Update pipeline.py
Browse files
- pipeline.py +10 -5
pipeline.py
CHANGED
|
@@ -287,13 +287,18 @@ async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, out_links
|
|
| 287 |
asyncio.to_thread(data_preprocess.extract_table, link, saveLinkFolder),
|
| 288 |
timeout=10
|
| 289 |
)
|
|
|
|
|
|
|
| 290 |
except Exception:
|
| 291 |
tables_link = []
|
| 292 |
|
| 293 |
# --- merge ---
|
| 294 |
try:
|
| 295 |
print("just merge text and tables")
|
| 296 |
-
|
|
|
|
|
|
|
|
|
|
| 297 |
except Exception:
|
| 298 |
print("no succeed here in preprocess docu")
|
| 299 |
final_input_link = ""
|
|
@@ -306,9 +311,9 @@ async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, out_links
|
|
| 306 |
if len(final_input_link) > 1000000:
|
| 307 |
final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
|
| 308 |
if len(final_input_link) > 1000000:
|
| 309 |
-
final_input_link = final_input_link[:
|
| 310 |
|
| 311 |
-
all_output
|
| 312 |
|
| 313 |
return context, all_output, chunk
|
| 314 |
|
|
@@ -802,8 +807,8 @@ async def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None, save
|
|
| 802 |
|
| 803 |
# combine results
|
| 804 |
for context, new_all_output, new_chunk in results:
|
| 805 |
-
all_output
|
| 806 |
-
chunk
|
| 807 |
# for link in links:
|
| 808 |
# print(link)
|
| 809 |
# # if len(all_output) > 1000*1000:
|
|
|
|
| 287 |
asyncio.to_thread(data_preprocess.extract_table, link, saveLinkFolder),
|
| 288 |
timeout=10
|
| 289 |
)
|
| 290 |
+
print("this is table link: ", str(table_links))
|
| 291 |
+
|
| 292 |
except Exception:
|
| 293 |
tables_link = []
|
| 294 |
|
| 295 |
# --- merge ---
|
| 296 |
try:
|
| 297 |
print("just merge text and tables")
|
| 298 |
+
try:
|
| 299 |
+
final_input_link = text_link + ", ".join(tables_link)
|
| 300 |
+
except:
|
| 301 |
+
final_input_link = str(text_link) + str(tables_link)
|
| 302 |
except Exception:
|
| 303 |
print("no succeed here in preprocess docu")
|
| 304 |
final_input_link = ""
|
|
|
|
| 311 |
if len(final_input_link) > 1000000:
|
| 312 |
final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
|
| 313 |
if len(final_input_link) > 1000000:
|
| 314 |
+
final_input_link = final_input_link[:1000000]
|
| 315 |
|
| 316 |
+
all_output += data_preprocess.normalize_for_overlap(all_output) + final_input_link
|
| 317 |
|
| 318 |
return context, all_output, chunk
|
| 319 |
|
|
|
|
| 807 |
|
| 808 |
# combine results
|
| 809 |
for context, new_all_output, new_chunk in results:
|
| 810 |
+
all_output += new_all_output
|
| 811 |
+
chunk += new_chunk
|
| 812 |
# for link in links:
|
| 813 |
# print(link)
|
| 814 |
# # if len(all_output) > 1000*1000:
|