Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +102 -25
scrape_3gpp.py
CHANGED
|
@@ -449,31 +449,108 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 449 |
extracted_content.append(discussion_details)
|
| 450 |
|
| 451 |
elif category == "pdf":
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
# Add more categories as needed
|
| 478 |
contenu = "\n".join(extracted_content)
|
| 479 |
|
|
|
|
| 449 |
extracted_content.append(discussion_details)
|
| 450 |
|
| 451 |
elif category == "pdf":
|
| 452 |
+
try:
|
| 453 |
+
tabLine = []
|
| 454 |
+
file = pdfReader
|
| 455 |
+
pdfNumberPages = len(file.pages)
|
| 456 |
+
for pdfPage in range(0, pdfNumberPages):
|
| 457 |
+
|
| 458 |
+
load_page = file.get_page(pdfPage)
|
| 459 |
+
text = load_page.extract_text()
|
| 460 |
+
lines = text.split("\n")
|
| 461 |
+
sizeOfLines = len(lines) - 1
|
| 462 |
+
keyword = ["objective", "introduction", "summary", "scope"]
|
| 463 |
+
|
| 464 |
+
for index, line in enumerate(lines):
|
| 465 |
+
print(line)
|
| 466 |
+
for key in keyword:
|
| 467 |
+
line = line.lower()
|
| 468 |
+
|
| 469 |
+
if key in line:
|
| 470 |
+
print("Found keyword")
|
| 471 |
+
lineBool = True
|
| 472 |
+
lineIndex = index
|
| 473 |
+
previousSelectedLines = []
|
| 474 |
+
stringLength = 0
|
| 475 |
+
linesForSelection = lines
|
| 476 |
+
loadOnce = True
|
| 477 |
+
selectedPdfPage = pdfPage
|
| 478 |
+
|
| 479 |
+
while lineBool:
|
| 480 |
+
print(lineIndex)
|
| 481 |
+
if stringLength > words_limit or lineIndex < 0:
|
| 482 |
+
lineBool = False
|
| 483 |
+
else:
|
| 484 |
+
if lineIndex == 0:
|
| 485 |
+
print(f"Line index == 0")
|
| 486 |
+
|
| 487 |
+
if pdfPage == 0:
|
| 488 |
+
lineBool = False
|
| 489 |
+
|
| 490 |
+
else:
|
| 491 |
+
try:
|
| 492 |
+
selectedPdfPage -= 1
|
| 493 |
+
newLoad_page = file.get_page(selectedPdfPage)
|
| 494 |
+
newText = newLoad_page.extract_text()
|
| 495 |
+
newLines = newText.split("\n")
|
| 496 |
+
linesForSelection = newLines
|
| 497 |
+
print(f"len newLines{len(newLines)}")
|
| 498 |
+
lineIndex = len(newLines) - 1
|
| 499 |
+
except Exception as e:
|
| 500 |
+
print(f"Loading previous PDF page failed")
|
| 501 |
+
lineBool = False
|
| 502 |
+
|
| 503 |
+
previousSelectedLines.append(linesForSelection[lineIndex])
|
| 504 |
+
stringLength += len(linesForSelection[lineIndex])
|
| 505 |
+
|
| 506 |
+
lineIndex -= 1
|
| 507 |
+
previousSelectedLines = ' '.join(previousSelectedLines[::-1])
|
| 508 |
+
|
| 509 |
+
lineBool = True
|
| 510 |
+
lineIndex = index + 1
|
| 511 |
+
nextSelectedLines = ""
|
| 512 |
+
linesForSelection = lines
|
| 513 |
+
loadOnce = True
|
| 514 |
+
selectedPdfPage = pdfPage
|
| 515 |
+
|
| 516 |
+
while lineBool:
|
| 517 |
+
|
| 518 |
+
if len(nextSelectedLines.split()) > words_limit:
|
| 519 |
+
lineBool = False
|
| 520 |
+
else:
|
| 521 |
+
if lineIndex > sizeOfLines:
|
| 522 |
+
lineBool = False
|
| 523 |
+
|
| 524 |
+
if pdfPage == pdfNumberPages - 1:
|
| 525 |
+
lineBool = False
|
| 526 |
+
|
| 527 |
+
else:
|
| 528 |
+
try:
|
| 529 |
+
selectedPdfPage += 1
|
| 530 |
+
newLoad_page = file.get_page(selectedPdfPage)
|
| 531 |
+
newText = newLoad_page.extract_text()
|
| 532 |
+
newLines = newText.split("\n")
|
| 533 |
+
linesForSelection = newLines
|
| 534 |
+
lineIndex = 0
|
| 535 |
+
except Exception as e:
|
| 536 |
+
print(f"Loading next PDF page failed")
|
| 537 |
+
lineBool = False
|
| 538 |
+
else:
|
| 539 |
+
nextSelectedLines += " " + linesForSelection[lineIndex]
|
| 540 |
+
lineIndex += 1
|
| 541 |
+
|
| 542 |
+
print(f"Previous Lines : {previousSelectedLines}")
|
| 543 |
+
print(f"Next Lines : {nextSelectedLines}")
|
| 544 |
+
selectedText = previousSelectedLines + ' ' + nextSelectedLines
|
| 545 |
+
print(selectedText)
|
| 546 |
+
tabLine.append([pdfPage, selectedText, key])
|
| 547 |
+
print(f"Selected line in keywords is: {line}")
|
| 548 |
+
|
| 549 |
+
for r in tabLine:
|
| 550 |
+
extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
|
| 551 |
+
extracted_content.append(''.join(r[1]))
|
| 552 |
+
except Exception as e:
|
| 553 |
+
print(f"Error occured while extracting PDF content : {e}")
|
| 554 |
# Add more categories as needed
|
| 555 |
contenu = "\n".join(extracted_content)
|
| 556 |
|