Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import json
|
|
| 6 |
import concurrent.futures
|
| 7 |
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
from langchain_community.document_loaders import PyPDFLoader
|
|
|
|
| 9 |
from langdetect import detect_langs
|
| 10 |
from PyPDF2 import PdfReader
|
| 11 |
from io import BytesIO
|
|
@@ -65,7 +66,7 @@ def download_pdf(url, timeout=10):
|
|
| 65 |
logging.error(f"PDF download error: {e}")
|
| 66 |
return None
|
| 67 |
|
| 68 |
-
def
|
| 69 |
reader = PdfReader(pdf_file)
|
| 70 |
extracted_text = ""
|
| 71 |
try:
|
|
@@ -80,19 +81,35 @@ def extract_text_from_pages(pdf_file, pages):
|
|
| 80 |
logging.error(f"PDF text extraction error: {e}")
|
| 81 |
return 'हे चालत नाही'
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
def process_link(link, similar_product):
|
| 84 |
if link in seen:
|
| 85 |
return None
|
| 86 |
seen.add(link)
|
| 87 |
try:
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
| 96 |
return None
|
| 97 |
|
| 98 |
def filtering(urls, similar_product):
|
|
|
|
| 6 |
import concurrent.futures
|
| 7 |
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
from langchain_community.document_loaders import PyPDFLoader
|
| 9 |
+
from langchain_community.document_loaders import WebBaseLoader
|
| 10 |
from langdetect import detect_langs
|
| 11 |
from PyPDF2 import PdfReader
|
| 12 |
from io import BytesIO
|
|
|
|
| 66 |
logging.error(f"PDF download error: {e}")
|
| 67 |
return None
|
| 68 |
|
| 69 |
+
def extract_text_from_pdf(pdf_file, pages):
|
| 70 |
reader = PdfReader(pdf_file)
|
| 71 |
extracted_text = ""
|
| 72 |
try:
|
|
|
|
| 81 |
logging.error(f"PDF text extraction error: {e}")
|
| 82 |
return 'हे चालत नाही'
|
| 83 |
|
| 84 |
+
def extract_text_online(link, max_pages=3):
    """Load a web document and return the text of its first few split pages.

    Args:
        link: URL of the page/document to fetch via ``WebBaseLoader``.
        max_pages: number of leading split pages to concatenate
            (default 3, matching the original hard-coded limit).

    Returns:
        str: concatenated ``page_content`` of the first ``max_pages`` pages.
    """
    loader = WebBaseLoader(link)
    pages = loader.load_and_split()
    # ''.join is linear in total text size; the original += loop is
    # quadratic in the worst case as the accumulated string grows.
    return ''.join(page.page_content for page in pages[:max_pages])
|
| 95 |
+
|
| 96 |
+
|
| 97 |
def process_link(link, similar_product):
    """Filter one candidate URL, returning it only if its text is relevant.

    Markdown links (``*.md``) are fetched with the web loader; every other
    link is treated as a PDF and pages 0, 2 and 4 are extracted.

    Args:
        link: candidate URL to evaluate.
        similar_product: product name forwarded to the relevance check.

    Returns:
        The link itself when its extracted text passes both
        ``language_preprocess`` and ``relevant``; otherwise ``None``
        (including for already-seen links and any extraction failure).

    Side effects:
        Adds ``link`` to the module-level ``seen`` set to deduplicate work.
    """
    if link in seen:
        return None
    seen.add(link)
    try:
        if link.endswith('.md'):
            text = extract_text_online(link)
        else:
            pdf_file = download_pdf(link)
            text = extract_text_from_pdf(pdf_file, [0, 2, 4])
        if language_preprocess(text):
            if relevant(main_product, similar_product, text):
                return link
    except Exception as e:
        # Best-effort: one bad link must not abort the whole batch, but the
        # failure should be visible. The original bare `except: pass` hid
        # every error, including KeyboardInterrupt/SystemExit.
        logging.error(f"process_link failed for {link}: {e}")
    return None
|
| 114 |
|
| 115 |
def filtering(urls, similar_product):
|