Karim shoair committed on
Commit ·
2b65543
1
Parent(s): 4da1914
fix(storage): possible threading issue with recursion + optimizations
Browse files
scrapling/core/storage_adaptors.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
from abc import ABC, abstractmethod
|
| 4 |
from hashlib import sha256
|
|
|
|
| 5 |
|
| 6 |
-
import
|
| 7 |
-
from
|
| 8 |
from tldextract import extract as tld
|
| 9 |
|
|
|
|
| 10 |
from scrapling.core._types import Dict, Optional, Union
|
| 11 |
-
from scrapling.core.utils import _StorageTools, log, lru_cache
|
| 12 |
|
| 13 |
|
| 14 |
class StorageSystemMixin(ABC):
|
|
@@ -35,7 +36,7 @@ class StorageSystemMixin(ABC):
|
|
| 35 |
return default_value
|
| 36 |
|
| 37 |
@abstractmethod
|
| 38 |
-
def save(self, element:
|
| 39 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 40 |
|
| 41 |
:param element: The element itself which we want to save to storage.
|
|
@@ -81,11 +82,10 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 81 |
"""
|
| 82 |
super().__init__(url)
|
| 83 |
self.storage_file = storage_file
|
| 84 |
-
|
| 85 |
-
self.lock = threading.Lock()
|
| 86 |
# >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
|
| 87 |
# `check_same_thread=False` to allow it to be used across different threads.
|
| 88 |
-
self.connection =
|
| 89 |
# WAL (Write-Ahead Logging) allows for better concurrency.
|
| 90 |
self.connection.execute("PRAGMA journal_mode=WAL")
|
| 91 |
self.cursor = self.connection.cursor()
|
|
@@ -106,7 +106,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 106 |
""")
|
| 107 |
self.connection.commit()
|
| 108 |
|
| 109 |
-
def save(self, element:
|
| 110 |
"""Saves the elements unique properties to the storage for retrieval and relocation later
|
| 111 |
|
| 112 |
:param element: The element itself which we want to save to storage.
|
|
@@ -121,7 +121,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 121 |
INSERT OR REPLACE INTO storage (url, identifier, element_data)
|
| 122 |
VALUES (?, ?, ?)
|
| 123 |
""",
|
| 124 |
-
(url, identifier,
|
| 125 |
)
|
| 126 |
self.cursor.fetchall()
|
| 127 |
self.connection.commit()
|
|
@@ -141,7 +141,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 141 |
)
|
| 142 |
result = self.cursor.fetchone()
|
| 143 |
if result:
|
| 144 |
-
return
|
| 145 |
return None
|
| 146 |
|
| 147 |
def close(self):
|
|
|
|
| 1 |
+
from sqlite3 import connect as db_connect
|
| 2 |
+
from threading import RLock
|
| 3 |
from abc import ABC, abstractmethod
|
| 4 |
from hashlib import sha256
|
| 5 |
+
from functools import lru_cache
|
| 6 |
|
| 7 |
+
from lxml.html import HtmlElement
|
| 8 |
+
from orjson import dumps, loads
|
| 9 |
from tldextract import extract as tld
|
| 10 |
|
| 11 |
+
from scrapling.core.utils import _StorageTools, log
|
| 12 |
from scrapling.core._types import Dict, Optional, Union
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class StorageSystemMixin(ABC):
|
|
|
|
| 36 |
return default_value
|
| 37 |
|
| 38 |
@abstractmethod
|
| 39 |
+
def save(self, element: HtmlElement, identifier: str) -> None:
|
| 40 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 41 |
|
| 42 |
:param element: The element itself which we want to save to storage.
|
|
|
|
| 82 |
"""
|
| 83 |
super().__init__(url)
|
| 84 |
self.storage_file = storage_file
|
| 85 |
+
self.lock = RLock() # Better than Lock for reentrancy
|
|
|
|
| 86 |
# >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
|
| 87 |
# `check_same_thread=False` to allow it to be used across different threads.
|
| 88 |
+
self.connection = db_connect(self.storage_file, check_same_thread=False)
|
| 89 |
# WAL (Write-Ahead Logging) allows for better concurrency.
|
| 90 |
self.connection.execute("PRAGMA journal_mode=WAL")
|
| 91 |
self.cursor = self.connection.cursor()
|
|
|
|
| 106 |
""")
|
| 107 |
self.connection.commit()
|
| 108 |
|
| 109 |
+
def save(self, element: HtmlElement, identifier: str):
|
| 110 |
"""Saves the elements unique properties to the storage for retrieval and relocation later
|
| 111 |
|
| 112 |
:param element: The element itself which we want to save to storage.
|
|
|
|
| 121 |
INSERT OR REPLACE INTO storage (url, identifier, element_data)
|
| 122 |
VALUES (?, ?, ?)
|
| 123 |
""",
|
| 124 |
+
(url, identifier, dumps(element_data)),
|
| 125 |
)
|
| 126 |
self.cursor.fetchall()
|
| 127 |
self.connection.commit()
|
|
|
|
| 141 |
)
|
| 142 |
result = self.cursor.fetchone()
|
| 143 |
if result:
|
| 144 |
+
return loads(result[0])
|
| 145 |
return None
|
| 146 |
|
| 147 |
def close(self):
|