Spaces:

Mazenbs
/

extract_html_full

Running

App Files Community

Mazenbs committed on Dec 3, 2025

Commit

12deac4

verified ·

1 Parent(s): 37ff93a

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -89

app.py CHANGED Viewed

@@ -1,98 +1,67 @@
-from fastapi import FastAPI, Query, HTTPException
-import httpx
 from bs4 import BeautifulSoup
-from typing import List, Dict, Any
-# تهيئة تطبيق FastAPI
-app = FastAPI(
-    title="محلل محتوى الصفحة (Page Content Analyzer)",
-    description="تطبيق لاستخراج النصوص من صفحة ويب معينة بناءً على رابط URL.",
-    version="1.0.0"
-)
-# قائمة بالوسوم (Tags) التي يجب تجاهل محتواها عادةً (مثل النصوص البرمجية والأنماط)
-# Ignore these common noise-generating tags
-IGNORE_TAGS = {
-    'script', 'style', 'noscript', 'head', 'meta', 'link', 'img', 'svg',
-    'button', 'input', 'select', 'textarea', 'nav', 'footer', 'header',
-    'form', 'iframe'
-}
-# دالة مساعدة لجلب وتحليل الرابط
-async def fetch_and_analyze(url: str) -> List[Dict[str, str]]:
-    """
-    تقوم بجلب محتوى الرابط، وتحليله، واستخراج النصوص الموجودة داخل وسوم HTML.
-    """
-    # تعيين مهلة قصوى للطلب (بالثواني)
-    TIMEOUT = 15
     try:
-        # 1. جلب المحتوى باستخدام عميل غير متزامن
-        # Fetch content using an async client
-        async with httpx.AsyncClient(timeout=TIMEOUT, follow_redirects=True) as client:
-            response = await client.get(url)
-            # إطلاق استثناء في حال وجود رمز حالة خطأ (4xx أو 5xx)
-            response.raise_for_status()
-            html_content = response.text
-        # 2. تحليل المحتوى باستخدام BeautifulSoup
-        # Parse content using BeautifulSoup
-        soup = BeautifulSoup(html_content, 'html.parser')
-        results: List[Dict[str, str]] = []
-        # Find all tags that might contain text
-        # البحث عن جميع الوسوم التي قد تحتوي على نص
-        for element in soup.find_all(True):
-            # التحقق مما إذا كان الوسم خارج قائمة التجاهل وليس جزءاً من وسم آخر تم استخلاصه
-            # Check if the tag is not in the ignore list
-            if element.name not in IGNORE_TAGS:
-                # الحصول على النص النظيف والمجرد من المسافات البيضاء الزائدة
-                # Get the clean, stripped text
-                text = element.get_text(strip=True, separator=' ')
-                # التحقق من أن النص ليس فارغاً
-                # Check if the text is meaningful
-                if text:
-                    # إضافة النص إلى قائمة النتائج بالتنسيق المطلوب: {'text': '...'}
-                    results.append({'text': text})
-        return results
-    except httpx.InvalidURL:
-        raise HTTPException(status_code=400, detail="الرابط غير صالح. يرجى التأكد من أن الرابط صحيح ويتضمن http:// أو https://.")
-    except httpx.ConnectTimeout:
-        raise HTTPException(status_code=504, detail="انتهت مهلة الاتصال بالرابط.")
-    except httpx.HTTPStatusError as e:
-        # Handling HTTP error status codes
-        raise HTTPException(status_code=e.response.status_code, detail=f"فشل في جلب الرابط: {e.response.status_code} - {e.response.reason_phrase}")
     except Exception as e:
-        # Catch other unexpected errors
-        raise HTTPException(status_code=500, detail=f"حدث خطأ غير متوقع أثناء التحليل: {type(e).__name__} - {str(e)}")
-@app.get(
-    "/analyze_url",
-    summary="تحليل محتوى الصفحة واستخراج النصوص",
-    description="يقوم بتحليل صفحة الويب المحددة واستخراج جميع النصوص الموجودة داخل وسوم HTML.",
-    response_model=List[Dict[str, str]]
-)
-async def get_page_content(
-    url: str = Query(
-        ...,
-        description="أدخل الرابط كاملاً (على سبيل المثال: https://www.google.com)",
-        example="https://www.google.com"
-    )
-):
-    """
-    المسار الرئيسي لاستقبال رابط URL وتنفيذ عملية التحليل.
-    - **url**: الرابط المراد تحليله.
-    - **النتيجة**: قائمة من القواميس، حيث يحتوي كل قاموس على نص تم استخلاصه من وسم: `{'text': 'النص المستخرج'}`.
-    """
-    # استدعاء دالة الجلب والتحليل
-    return await fetch_and_analyze(url)
-# معلومات تشغيل التطبيق (للتذكير)
-# To run this app, you would use: uvicorn app:app --reload
-# This instruction is for user reference only.

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, HttpUrl
+import requests
 from bs4 import BeautifulSoup
+from typing import List, Dict
+import re
+app = FastAPI(title="Text Extractor API", version="1.0.0")
+# Request body accepted by the POST /extract-text handler.
+class URLRequest(BaseModel):
+    # Address of the page to fetch; declared as pydantic's HttpUrl type.
+    url: HttpUrl
+# One item of the list declared as the response model of POST /extract-text.
+class TextResponse(BaseModel):
+    # A single text fragment extracted from the page.
+    text: str
+@app.post("/extract-text", response_model=List[TextResponse])
+async def extract_text_from_url(request: URLRequest):
     try:
+        # إرسال طلب GET للرابط
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        }
+        response = requests.get(str(request.url), headers=headers, timeout=10)
+        response.raise_for_status()
+        # تحليل HTML
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # إزالة العناصر غير المرغوب فيها
+        for element in soup(['script', 'style', 'meta', 'link', 'noscript']):
+            element.decompose()
+        # استخراج جميع النصوص من كل العناصر
+        text_elements = []
+        # البحث عن جميع العناصر التي تحتوي على نص
+        for element in soup.find_all(text=True):
+            text = element.strip()
+            # تنظيف النص وإزالة المسافات الزائدة
+            text = re.sub(r'\s+', ' ', text)
+            # تجاهل النصوص الفارغة والنصوص التي تحتوي على محارف خاصة فقط
+            if text and len(text) > 1 and not text.isspace():
+                text_elements.append({'text': text})
+        # إزالة النصوص المكررة
+        seen = set()
+        unique_texts = []
+        for item in text_elements:
+            if item['text'] not in seen:
+                seen.add(item['text'])
+                unique_texts.append(item)
+        return unique_texts
+    except requests.RequestException as e:
+        raise HTTPException(status_code=400, detail=f"Error fetching URL: {str(e)}")
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing content: {str(e)}")
+@app.get("/")
+async def root():
+    return {"message": "Text Extractor API is running! Send POST request to /extract-text"}
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)