Karim shoair committed on
Commit
2b65543
·
1 Parent(s): 4da1914

fix(storage): possible threading issue with recursion + optimizations

Browse files
Files changed (1) hide show
  1. scrapling/core/storage_adaptors.py +12 -12
scrapling/core/storage_adaptors.py CHANGED
@@ -1,14 +1,15 @@
1
- import sqlite3
2
- import threading
3
  from abc import ABC, abstractmethod
4
  from hashlib import sha256
 
5
 
6
- import orjson
7
- from lxml import html
8
  from tldextract import extract as tld
9
 
 
10
  from scrapling.core._types import Dict, Optional, Union
11
- from scrapling.core.utils import _StorageTools, log, lru_cache
12
 
13
 
14
  class StorageSystemMixin(ABC):
@@ -35,7 +36,7 @@ class StorageSystemMixin(ABC):
35
  return default_value
36
 
37
  @abstractmethod
38
- def save(self, element: html.HtmlElement, identifier: str) -> None:
39
  """Saves the element's unique properties to the storage for retrieval and relocation later
40
 
41
  :param element: The element itself which we want to save to storage.
@@ -81,11 +82,10 @@ class SQLiteStorageSystem(StorageSystemMixin):
81
  """
82
  super().__init__(url)
83
  self.storage_file = storage_file
84
- # We use a threading.Lock to ensure thread-safety instead of relying on thread-local storage.
85
- self.lock = threading.Lock()
86
  # >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
87
  # `check_same_thread=False` to allow it to be used across different threads.
88
- self.connection = sqlite3.connect(self.storage_file, check_same_thread=False)
89
  # WAL (Write-Ahead Logging) allows for better concurrency.
90
  self.connection.execute("PRAGMA journal_mode=WAL")
91
  self.cursor = self.connection.cursor()
@@ -106,7 +106,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
106
  """)
107
  self.connection.commit()
108
 
109
- def save(self, element: html.HtmlElement, identifier: str):
110
  """Saves the elements unique properties to the storage for retrieval and relocation later
111
 
112
  :param element: The element itself which we want to save to storage.
@@ -121,7 +121,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
121
  INSERT OR REPLACE INTO storage (url, identifier, element_data)
122
  VALUES (?, ?, ?)
123
  """,
124
- (url, identifier, orjson.dumps(element_data)),
125
  )
126
  self.cursor.fetchall()
127
  self.connection.commit()
@@ -141,7 +141,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
141
  )
142
  result = self.cursor.fetchone()
143
  if result:
144
- return orjson.loads(result[0])
145
  return None
146
 
147
  def close(self):
 
1
+ from sqlite3 import connect as db_connect
2
+ from threading import RLock
3
  from abc import ABC, abstractmethod
4
  from hashlib import sha256
5
+ from functools import lru_cache
6
 
7
+ from lxml.html import HtmlElement
8
+ from orjson import dumps, loads
9
  from tldextract import extract as tld
10
 
11
+ from scrapling.core.utils import _StorageTools, log
12
  from scrapling.core._types import Dict, Optional, Union
 
13
 
14
 
15
  class StorageSystemMixin(ABC):
 
36
  return default_value
37
 
38
  @abstractmethod
39
+ def save(self, element: HtmlElement, identifier: str) -> None:
40
  """Saves the element's unique properties to the storage for retrieval and relocation later
41
 
42
  :param element: The element itself which we want to save to storage.
 
82
  """
83
  super().__init__(url)
84
  self.storage_file = storage_file
85
+ self.lock = RLock() # Better than Lock for reentrancy
 
86
  # >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
87
  # `check_same_thread=False` to allow it to be used across different threads.
88
+ self.connection = db_connect(self.storage_file, check_same_thread=False)
89
  # WAL (Write-Ahead Logging) allows for better concurrency.
90
  self.connection.execute("PRAGMA journal_mode=WAL")
91
  self.cursor = self.connection.cursor()
 
106
  """)
107
  self.connection.commit()
108
 
109
+ def save(self, element: HtmlElement, identifier: str):
110
  """Saves the elements unique properties to the storage for retrieval and relocation later
111
 
112
  :param element: The element itself which we want to save to storage.
 
121
  INSERT OR REPLACE INTO storage (url, identifier, element_data)
122
  VALUES (?, ?, ?)
123
  """,
124
+ (url, identifier, dumps(element_data)),
125
  )
126
  self.cursor.fetchall()
127
  self.connection.commit()
 
141
  )
142
  result = self.cursor.fetchone()
143
  if result:
144
+ return loads(result[0])
145
  return None
146
 
147
  def close(self):