Spaces:

Mazenbs
/

extract_html_full

Sleeping

App Files Files Community

Mazenbs committed on Dec 3, 2025

Commit

37ff93a

verified ·

1 Parent(s): 61469f8

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -151

app.py CHANGED Viewed

@@ -1,168 +1,98 @@
-import requests
 from bs4 import BeautifulSoup
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel, HttpUrl
-from typing import List, Dict, Optional
-import urllib.parse
-from fastapi.middleware.cors import CORSMiddleware
 app = FastAPI(
-    title="Web Text Extractor API",
-    description="API لاستخراج النصوص من صفحات الويب",
     version="1.0.0"
 )
-# إضافة CORS للسماح بالوصول من أي مصدر
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# نموذج بيانات للإدخال
-class URLRequest(BaseModel):
-    url: HttpUrl
-    include_empty_text: Optional[bool] = False
-    specific_tags: Optional[List[str]] = None
-    exclude_tags: Optional[List[str]] = None
-def is_valid_url(url: str) -> bool:
-    """التحقق من صحة الرابط"""
     try:
-        result = urllib.parse.urlparse(url)
-        return all([result.scheme, result.netloc])
-    except:
-        return False
-def fetch_webpage(url: str) -> str:
-    """جلب محتوى صفحة الويب"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-    try:
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()
-        return response.text
-    except requests.exceptions.RequestException as e:
-        raise HTTPException(status_code=400, detail=f"فشل في جلب الصفحة: {str(e)}")
-def extract_text_from_tags(html_content: str, include_empty: bool = False,
-                          specific_tags: List[str] = None, exclude_tags: List[str] = None) -> List[Dict[str, str]]:
-    """استخراج النصوص من جميع العلامات"""
-    soup = BeautifulSoup(html_content, 'html.parser')
-    # تحديد العلامات المراد استخراج النصوص منها
-    if specific_tags:
-        tags = soup.find_all(specific_tags)
-    else:
-        # استخراج جميع العلامات التي يمكن أن تحتوي على نص
-        tags = soup.find_all(True)
-    # فلترة العلامات المستبعدة
-    if exclude_tags:
-        tags = [tag for tag in tags if tag.name not in exclude_tags]
-    result = []
-    for tag in tags:
-        # الحصول على النص مع تنظيفه
-        text = tag.get_text(strip=True, separator=' ')
-        # إذا كان النص فارغاً ولا نريد تضمينه
-        if not text and not include_empty:
-            continue
-        # إضافة النتيجة إلى القائمة
-        result.append({
-            'text': text if text else '',
-            'tag': tag.name,
-            'attrs': dict(tag.attrs) if tag.attrs else {}
-        })
-    return result
-@app.get("/")
-async def root():
-    """الصفحة الرئيسية"""
-    return {
-        "message": "مرحباً بك في API استخراج نصوص صفحات الويب",
-        "endpoints": {
-            "POST /extract": "استخراج النصوص من رابط URL",
-            "GET /health": "فحص حالة الخدمة"
-        },
-        "usage": {
-            "example_request": {
-                "url": "https://example.com",
-                "include_empty_text": False,
-                "specific_tags": ["p", "h1", "h2", "h3"],
-                "exclude_tags": ["script", "style"]
-            }
-        }
-    }
-@app.post("/parse", response_model=List[Dict[str, str]])
-async def extract_text(request: URLRequest):
-    """
-    استخراج النصوص من صفحة ويب
-    - **url**: رابط الصفحة المراد تحليلها
-    - **include_empty_text**: تضمين العلامات الفارغة (افتراضي: False)
-    - **specific_tags**: قائمة بعلامات محددة لاستخراج النصوص منها (افتراضي: جميع العلامات)
-    - **exclude_tags**: قائمة بعلامات لتجاهلها (مثل: script, style)
-    """
-    # جلب محتوى الصفحة
-    html_content = fetch_webpage(str(request.url))
-    # استخراج النصوص
-    extracted_texts = extract_text_from_tags(
-        html_content,
-        include_empty=request.include_empty_text,
-        specific_tags=request.specific_tags,
-        exclude_tags=request.exclude_tags or ["script", "style", "meta", "link", "noscript"]
-    )
-    return extracted_texts
-@app.get("/parse")
-async def extract_text_get(url: str, include_empty: bool = False):
-    """نسخة GET من استخراج النصوص (للسهولة)"""
-    if not is_valid_url(url):
-        raise HTTPException(status_code=400, detail="رابط غير صالح")
-    html_content = fetch_webpage(url)
-    extracted_texts = extract_text_from_tags(html_content, include_empty)
-    return extracted_texts
-@app.get("/health")
-async def health_check():
-    """فحص حالة الخدمة"""
-    return {"status": "healthy", "service": "web-text-extractor"}
-@app.get("/tags-info")
-async def tags_info():
-    """معلومات عن العلامات الشائعة التي تحتوي على نص"""
-    common_tags = {
-        "headings": ["h1", "h2", "h3", "h4", "h5", "h6"],
-        "paragraphs": ["p"],
-        "lists": ["li", "dt", "dd"],
-        "text_formatting": ["span", "strong", "em", "b", "i", "mark", "small"],
-        "links": ["a"],
-        "tables": ["td", "th", "caption"],
-        "quotes": ["blockquote", "q"],
-        "other": ["div", "section", "article", "header", "footer", "nav", "main", "aside"]
-    }
-    return {
-        "info": "العلامات الشائعة التي تحتوي على نص في صفحات الويب",
-        "common_tags": common_tags,
-        "excluded_by_default": ["script", "style", "meta", "link", "noscript", "img", "br", "hr"]
-    }
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+from fastapi import FastAPI, Query, HTTPException
+import httpx
 from bs4 import BeautifulSoup
+from typing import List, Dict, Any
+# تهيئة تطبيق FastAPI
 app = FastAPI(
+    title="محلل محتوى الصفحة (Page Content Analyzer)",
+    description="تطبيق لاستخراج النصوص من صفحة ويب معينة بناءً على رابط URL.",
     version="1.0.0"
 )
+# قائمة بالوسوم (Tags) التي يجب تجاهل محتواها عادةً (مثل النصوص البرمجية والأنماط)
+# Ignore these common noise-generating tags
+IGNORE_TAGS = {
+    'script', 'style', 'noscript', 'head', 'meta', 'link', 'img', 'svg',
+    'button', 'input', 'select', 'textarea', 'nav', 'footer', 'header',
+    'form', 'iframe'
+}
+# دالة مساعدة لجلب وتحليل الرابط
+async def fetch_and_analyze(url: str) -> List[Dict[str, str]]:
+    """
+    تقوم بجلب محتوى الرابط، وتحليله، واستخراج النصوص الموجودة داخل وسوم HTML.
+    """
+    # تعيين مهلة قصوى للطلب (بالثواني)
+    TIMEOUT = 15
     try:
+        # 1. جلب المحتوى باستخدام عميل غير متزامن
+        # Fetch content using an async client
+        async with httpx.AsyncClient(timeout=TIMEOUT, follow_redirects=True) as client:
+            response = await client.get(url)
+            # إطلاق استثناء في حال وجود رمز حالة خطأ (4xx أو 5xx)
+            response.raise_for_status()
+            html_content = response.text
+        # 2. تحليل المحتوى باستخدام BeautifulSoup
+        # Parse content using BeautifulSoup
+        soup = BeautifulSoup(html_content, 'html.parser')
+        results: List[Dict[str, str]] = []
+        # Find all tags that might contain text
+        # البحث عن جميع الوسوم التي قد تحتوي على نص
+        for element in soup.find_all(True):
+            # التحقق مما إذا كان الوسم خارج قائمة التجاهل وليس جزءاً من وسم آخر تم استخلاصه
+            # Check if the tag is not in the ignore list
+            if element.name not in IGNORE_TAGS:
+                # الحصول على النص النظيف والمجرد من المسافات البيضاء الزائدة
+                # Get the clean, stripped text
+                text = element.get_text(strip=True, separator=' ')
+                # التحقق من أن النص ليس فارغاً
+                # Check if the text is meaningful
+                if text:
+                    # إضافة النص إلى قائمة النتائج بالتنسيق المطلوب: {'text': '...'}
+                    results.append({'text': text})
+        return results
+    except httpx.InvalidURL:
+        raise HTTPException(status_code=400, detail="الرابط غير صالح. يرجى التأكد من أن الرابط صحيح ويتضمن http:// أو https://.")
+    except httpx.ConnectTimeout:
+        raise HTTPException(status_code=504, detail="انتهت مهلة الاتصال بالرابط.")
+    except httpx.HTTPStatusError as e:
+        # Handling HTTP error status codes
+        raise HTTPException(status_code=e.response.status_code, detail=f"فشل في جلب الرابط: {e.response.status_code} - {e.response.reason_phrase}")
+    except Exception as e:
+        # Catch other unexpected errors
+        raise HTTPException(status_code=500, detail=f"حدث خطأ غير متوقع أثناء التحليل: {type(e).__name__} - {str(e)}")
+@app.get(
+    "/analyze_url",
+    summary="تحليل محتوى الصفحة واستخراج النصوص",
+    description="يقوم بتحليل صفحة الويب المحددة واستخراج جميع النصوص الموجودة داخل وسوم HTML.",
+    response_model=List[Dict[str, str]]
+)
+async def get_page_content(
+    url: str = Query(
+        ...,
+        description="أدخل الرابط كاملاً (على سبيل المثال: https://www.google.com)",
+        example="https://www.google.com"
+    )
+):
+    """
+    المسار الرئيسي لاستقبال رابط URL وتنفيذ عملية التحليل.
+    - **url**: الرابط المراد تحليله.
+    - **النتيجة**: قائمة من القواميس، حيث يحتوي كل قاموس على نص تم استخلاصه من وسم: `{'text': 'النص المستخرج'}`.
+    """
+    # استدعاء دالة الجلب والتحليل
+    return await fetch_and_analyze(url)
+# معلومات تشغيل التطبيق (للتذكير)
+# To run this app, you would use: uvicorn app:app --reload
+# This instruction is for user reference only.