Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +30 -0
scrape_3gpp.py
CHANGED
|
@@ -8,6 +8,7 @@ import zipfile
|
|
| 8 |
import textract
|
| 9 |
import gradio as gr
|
| 10 |
import shutil
|
|
|
|
| 11 |
|
| 12 |
def browse_folder(url):
|
| 13 |
if url.lower().endswith(('docs', 'docs/')):
|
|
@@ -297,6 +298,8 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 297 |
if file.endswith((".pptx", ".ppt", ".pdf", ".docx", ".doc", ".DOCX")):
|
| 298 |
try:
|
| 299 |
text = textract.process(file_path).decode('utf-8')
|
|
|
|
|
|
|
| 300 |
except Exception as e:
|
| 301 |
print(f"Error processing {file_path}: {e}")
|
| 302 |
errors_count += 1
|
|
@@ -419,6 +422,33 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 419 |
# Here's a simplified example
|
| 420 |
discussion_details = Discussion
|
| 421 |
extracted_content.append(discussion_details)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
# Add more categories as needed
|
| 423 |
contenu = "\n".join(extracted_content)
|
| 424 |
|
|
|
|
| 8 |
import textract
|
| 9 |
import gradio as gr
|
| 10 |
import shutil
|
| 11 |
+
from pypdf import PdfReader
|
| 12 |
|
| 13 |
def browse_folder(url):
|
| 14 |
if url.lower().endswith(('docs', 'docs/')):
|
|
|
|
| 298 |
if file.endswith((".pptx", ".ppt", ".pdf", ".docx", ".doc", ".DOCX")):
|
| 299 |
try:
|
| 300 |
text = textract.process(file_path).decode('utf-8')
|
| 301 |
+
if file.endswith((".pdf")):
|
| 302 |
+
pdfReader = PdfReader(file_path)
|
| 303 |
except Exception as e:
|
| 304 |
print(f"Error processing {file_path}: {e}")
|
| 305 |
errors_count += 1
|
|
|
|
| 422 |
# Here's a simplified example
|
| 423 |
discussion_details = Discussion
|
| 424 |
extracted_content.append(discussion_details)
|
| 425 |
+
|
| 426 |
+
elif category == "pdf":
|
| 427 |
+
tabLine = []
|
| 428 |
+
file = pdfReader
|
| 429 |
+
pdfNumberPages = len(file.pages)
|
| 430 |
+
for pdfPage in range(0, pdfNumberPages):
|
| 431 |
+
|
| 432 |
+
load_page = file.get_page(pdfPage)
|
| 433 |
+
text = load_page.extract_text()
|
| 434 |
+
lines = text.split("\n")
|
| 435 |
+
|
| 436 |
+
keyword = ["objective", "introduction", "summary", "scope"]
|
| 437 |
+
for line in lines:
|
| 438 |
+
print(line)
|
| 439 |
+
if len(line) < 20:
|
| 440 |
+
for key in keyword:
|
| 441 |
+
line = line.lower()
|
| 442 |
+
if key in line:
|
| 443 |
+
start_index = line.find(key)
|
| 444 |
+
selectedText = lines[start_index:]
|
| 445 |
+
|
| 446 |
+
tabLine.append([pdfPage,selectedText,key])
|
| 447 |
+
print(f"Selected line in keywords is: {line}")
|
| 448 |
+
for r in tabLine:
|
| 449 |
+
extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
|
| 450 |
+
extracted_content.append(' '.join(r[1]))
|
| 451 |
+
|
| 452 |
# Add more categories as needed
|
| 453 |
contenu = "\n".join(extracted_content)
|
| 454 |
|