Spaces:

Mazenbs
/

extract_html_full

Running

App Files Files Community

Mazenbs commited on Dec 3, 2025

Commit

e644b0b

verified ·

1 Parent(s): f26a27b

Update helpers/text_blocks.py

Browse files

Files changed (1) hide show

helpers/text_blocks.py +119 -25

helpers/text_blocks.py CHANGED Viewed

@@ -1,30 +1,124 @@
-# helpers/text_blocks.py
 from bs4 import BeautifulSoup
-from typing import List, Dict
-# All text-bearing tags that may be extracted as independent blocks
-ALLOWED_TEXT_TAGS = ["p", "li", "span", "div", "h1", "h2", "h3", "h4", "h5"]
-def extract_all_text_blocks(soup: BeautifulSoup) -> List[Dict[str, str]]:
     """
-    Return a list of dictionaries of the form:
-    - {"text": "..."} for every allowed text tag (without dropping any part of it)
-    - {"image": "..."} for every image that has a src attribute
-    Each tag is returned as its own text entry via get_text so nested text is not lost.
     """
-    blocks: List[Dict[str, str]] = []
-    # Collect images first, so every image entry precedes the text entries
-    for img_tag in soup.find_all("img"):
-        src = img_tag.get("src")
-        if src:
-            blocks.append({"image": src})
-    # Collect the text of every allowed tag
-    # NOTE(review): get_text() includes descendant text, so nested allowed tags presumably yield overlapping entries
-    for tag in soup.find_all(ALLOWED_TEXT_TAGS):
-        text = tag.get_text(separator=" ", strip=True)
-        if text:
-            blocks.append({"text": text})
-    return blocks

+import requests
 from bs4 import BeautifulSoup
+from typing import List, Dict, Optional
+import re
class TextExtractor:
    """Download a web page and return its text fragments.

    The page is fetched with ``requests``, parsed with BeautifulSoup's
    ``html.parser``, stripped of non-content tags, and every remaining text
    node is normalised, filtered and de-duplicated.
    """

    # Default request headers: a desktop-Chrome User-Agent string.
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Tags whose whole subtree is removed before text extraction.
    _UNWANTED_TAGS = ('script', 'style', 'meta', 'link', 'noscript', 'header', 'footer', 'nav')

    # Compiled once at class creation instead of on every call.
    _WHITESPACE_RE = re.compile(r'\s+')
    _SYMBOLS_ONLY_RE = re.compile(r'^[^\w\u0600-\u06FF]+$')

    def __init__(self, headers: Optional[Dict[str, str]] = None):
        """Create an extractor.

        Args:
            headers: Request headers to send. When omitted, a copy of
                ``DEFAULT_HEADERS`` is used.
        """
        # Copy so later mutation of self.headers never touches the caller's
        # dict or the shared class-level default.
        self.headers = dict(self.DEFAULT_HEADERS if headers is None else headers)

    def extract_text_from_url(self, url: str, timeout: int = 10) -> List[Dict[str, str]]:
        """Extract all text fragments from a web page.

        Args:
            url: Address of the page to analyse.
            timeout: Timeout in seconds, passed to ``requests.get``.

        Returns:
            A list of ``{'text': <fragment>}`` dictionaries, without
            duplicates, in the order the fragments first appear.

        Raises:
            requests.RequestException: If fetching the page fails or the
                response has an error status. The original exception is
                chained as ``__cause__`` and its ``request``/``response``
                are carried over.
            Exception: If anything else fails while processing the content;
                the original exception is chained as ``__cause__``.
        """
        try:
            # Fetch the page; raise_for_status turns 4xx/5xx into HTTPError.
            response = requests.get(url, headers=self.headers, timeout=timeout)
            response.raise_for_status()
            # Parse the raw bytes so BeautifulSoup handles encoding detection.
            soup = BeautifulSoup(response.content, 'html.parser')
            # Drop non-content tags, then collect, clean and de-duplicate.
            self._clean_html(soup)
            raw_texts = self._extract_texts(soup)
            cleaned_texts = self._clean_and_filter_texts(raw_texts)
            return self._remove_duplicates(cleaned_texts)
        except requests.RequestException as e:
            # Same exception type and message as before, but keep the cause
            # and the request/response so callers can still inspect them.
            raise requests.RequestException(
                f"Error fetching URL: {e}",
                request=e.request,
                response=e.response,
            ) from e
        except Exception as e:
            # Kept as a plain Exception for backward compatibility.
            raise Exception(f"Error processing content: {e}") from e

    def _clean_html(self, soup: "BeautifulSoup") -> None:
        """Remove every tag listed in ``_UNWANTED_TAGS`` from ``soup`` in place."""
        for tag_name in self._UNWANTED_TAGS:
            for element in soup.find_all(tag_name):
                element.decompose()

    def _extract_texts(self, soup: "BeautifulSoup") -> List[str]:
        """Return the stripped, non-empty text nodes of ``soup`` in document order.

        Comments, doctypes, declarations and processing instructions are
        string nodes too, but they are markup rather than page text, so they
        are skipped.
        """
        # Local import: these names are only needed here and bs4 is already a
        # dependency of this module.
        from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

        markup_only = (Comment, Declaration, Doctype, ProcessingInstruction)
        text_elements = []
        # `string=` is the current name of the deprecated `text=` argument.
        for node in soup.find_all(string=True):
            if isinstance(node, markup_only):
                continue
            stripped = node.strip()
            if stripped:
                text_elements.append(stripped)
        return text_elements

    def _clean_and_filter_texts(self, texts: List[str]) -> List[str]:
        """Collapse whitespace runs in each text and keep only the valid ones."""
        normalised = (self._WHITESPACE_RE.sub(' ', text).strip() for text in texts)
        return [text for text in normalised if self._is_valid_text(text)]

    def _is_valid_text(self, text: str) -> bool:
        """Return True when ``text`` is worth keeping.

        Rejected: empty text, text shorter than two characters once stripped
        (which also covers whitespace-only text), and text made up solely of
        characters that are neither word characters nor in U+0600-U+06FF.
        """
        if not text or len(text.strip()) < 2:
            return False
        return self._SYMBOLS_ONLY_RE.match(text) is None

    def _remove_duplicates(self, texts: List[str]) -> List[Dict[str, str]]:
        """Drop repeated texts, keeping first occurrences, and wrap each as ``{'text': ...}``."""
        # dict.fromkeys de-duplicates while preserving insertion order.
        return [{'text': text} for text in dict.fromkeys(texts)]
# Standalone function for quick, one-off use
def extract_text_from_url(url: str, timeout: int = 10) -> List[Dict[str, str]]:
    """
    Extract the texts of a page without creating an extractor by hand.

    Args:
        url: Address of the page.
        timeout: Timeout forwarded to TextExtractor.extract_text_from_url.

    Returns:
        The list of extracted texts, exactly as returned by
        TextExtractor.extract_text_from_url.
    """
    return TextExtractor().extract_text_from_url(url, timeout)