Spaces:

Mazenbs
/

extract_html_full

Sleeping

App Files Files Community

Mazenbs committed on Dec 3, 2025

Commit

1aa9765

verified ·

1 Parent(s): 4bb3b2b

Update app.py

Browse files

Files changed (1) hide show

app.py +165 -82

app.py CHANGED Viewed

@@ -1,85 +1,168 @@
-# app.py
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from services.fetcher import fetch_html
-from parser.assembler import parse_law_from_html
-from helpers.text_blocks import extract_all_text_blocks
 from bs4 import BeautifulSoup
-from starlette.concurrency import run_in_threadpool
-from supabase_utils import save_law_to_supabase
-app = FastAPI(title="Law Extractor API", version="1.0")
-# -----------------------------
-# نماذج البيانات
-# -----------------------------
-class ParseRequest(BaseModel):
-    url: str
-    save_to_supabase: bool = False
-    response_format: str = "law_json"  # "law_json" أو "text_blocks"
-class LawJsonResponse(BaseModel):
-    title: str
-    preamble: str
-    sections: list
-class TextBlocksResponse(BaseModel):
-    text_blocks: list
-# -----------------------------
-# نقطة النهاية
-# -----------------------------
-@app.post("/parse")
-async def parse_law(request: ParseRequest):
-    url = request.url
-    save_flag = request.save_to_supabase
-    response_format = request.response_format.lower()
-    # 1) جلب HTML
     try:
-        html = await fetch_html(url)
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"فشل تحميل الصفحة: {e}")
-    # 2) إنشاء BeautifulSoup
-    soup = BeautifulSoup(html, "html.parser")
-    # 3) إذا اختار المستخدم text_blocks
-    if response_format == "text_blocks":
-        blocks = extract_all_text_blocks(soup)
-        # إزالة التكرارات فقط
-        seen = set()
-        clean_blocks = []
-        for blk in blocks:
-            if "text" in blk:
-                if blk["text"] not in seen:
-                    clean_blocks.append(blk)
-                    seen.add(blk["text"])
-            else:
-                clean_blocks.append(blk)
-        return {"text_blocks": clean_blocks}
-    # 4) خلاف ذلك: تحليل كامل إلى law_json
     try:
-        law_json = await run_in_threadpool(parse_law_from_html, html)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"خطأ أثناء التحليل: {e}")
-    # 5) الحفظ في Supabase عند الطلب
-    if save_flag:
-        try:
-            await run_in_threadpool(save_law_to_supabase, law_json)
-        except Exception as e:
-            raise HTTPException(
-                status_code=500,
-                detail=f"تم التحليل بنجاح ولكن فشل الحفظ في Supabase: {e}"
-            )
-    return law_json

+import requests
 from bs4 import BeautifulSoup
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, HttpUrl
+from typing import List, Dict, Optional
+import urllib.parse
+from fastapi.middleware.cors import CORSMiddleware
+# Application object; title/description/version are published in the generated API docs.
+app = FastAPI(
+    title="Web Text Extractor API",
+    description="API لاستخراج النصوص من صفحات الويب",
+    version="1.0.0"
+)
+# Add CORS so the API can be called from any origin.
+# Credentials are disabled: a wildcard origin must not be combined with
+# allow_credentials=True (browsers reject "*" on credentialed requests, and
+# reflecting arbitrary origins would expose cookies to any site). None of the
+# routes in this file rely on cookies or Authorization headers.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Input data model: the JSON body accepted by POST /extract.
+# (Documented with comments rather than a class docstring, since a model
+# docstring would be emitted as the schema description.)
+class URLRequest(BaseModel):
+    # Page to fetch; declared as pydantic's HttpUrl type
+    url: HttpUrl
+    # Forwarded as `include_empty` to extract_text_from_tags
+    include_empty_text: Optional[bool] = False
+    # Optional tag names to restrict extraction to; None means every tag
+    specific_tags: Optional[List[str]] = None
+    # Optional tag names to skip; the POST handler substitutes its own
+    # default list when this is None or empty
+    exclude_tags: Optional[List[str]] = None
+def is_valid_url(url: str) -> bool:
+    """Return True when *url* parses with both a scheme and a network location.
+
+    This is a purely syntactic check: it does not restrict the scheme and does
+    not verify that the host exists or is reachable.
+    """
     try:
+        result = urllib.parse.urlparse(url)
+    except (ValueError, TypeError, AttributeError):
+        # urlparse raises ValueError on malformed input (e.g. an unbalanced
+        # IPv6 bracket); TypeError/AttributeError cover non-string arguments.
+        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
+        return False
+    return bool(result.scheme and result.netloc)
+def fetch_webpage(url: str) -> str:
+    """Download *url* with a blocking GET and return the response body as text.
+
+    The request is sent with a desktop-browser User-Agent header and a
+    10-second timeout.
+
+    Raises:
+        HTTPException: status 400 when the request fails or
+            ``raise_for_status`` rejects the response; the original
+            ``requests`` exception is attached as the cause.
+    """
+    # NOTE(security): `url` is caller-supplied and fetched from the server
+    # side with no host filtering, so internal/loopback addresses are
+    # reachable through this function (SSRF). Restrict targets before
+    # exposing the service publicly.
+    # NOTE: this call blocks the calling thread until the response arrives.
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
     try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        return response.text
+    except requests.exceptions.RequestException as e:
+        # Chain the cause so the underlying network error stays in tracebacks/logs
+        raise HTTPException(status_code=400, detail=f"فشل في جلب الصفحة: {str(e)}") from e
+def extract_text_from_tags(html_content: str, include_empty: bool = False,
+                          specific_tags: Optional[List[str]] = None,
+                          exclude_tags: Optional[List[str]] = None) -> List[Dict[str, object]]:
+    """Parse *html_content* and return one record per matching tag.
+
+    Args:
+        html_content: HTML source to parse with the 'html.parser' backend.
+        include_empty: when False, tags whose stripped text is empty are skipped.
+        specific_tags: tag names to search for; None or empty selects every tag.
+        exclude_tags: tag names to drop from the matches; None or empty drops none.
+
+    Returns:
+        A list of dicts with keys 'text' (str), 'tag' (tag name) and
+        'attrs' (a copy of the tag's attribute dict), in document order.
+
+    Note:
+        Every matching tag is reported independently, so text inside nested
+        elements appears once for the element itself and again for each
+        matching ancestor.
+    """
+    soup = BeautifulSoup(html_content, 'html.parser')
+    # find_all(True) matches every tag; a list of names restricts the search
+    tags = soup.find_all(specific_tags if specific_tags else True)
+    # Build the exclusion set once instead of scanning a list for every tag
+    excluded = frozenset(exclude_tags or ())
+    result = []
+    for tag in tags:
+        if tag.name in excluded:
+            continue
+        # Collapse the tag's text fragments into one space-separated string
+        text = tag.get_text(strip=True, separator=' ')
+        if not text and not include_empty:
+            continue
+        result.append({
+            'text': text,
+            'tag': tag.name,
+            'attrs': dict(tag.attrs),
+        })
+    return result
+@app.get("/")
+async def root():
+    """الصفحة الرئيسية"""
+    # The docstring above is kept verbatim: FastAPI publishes it as this
+    # operation's description. The payload is a static welcome message, an
+    # endpoint summary, and a sample POST /extract body.
+    endpoint_summary = {
+        "POST /extract": "استخراج النصوص من رابط URL",
+        "GET /health": "فحص حالة الخدمة",
+    }
+    sample_body = {
+        "url": "https://example.com",
+        "include_empty_text": False,
+        "specific_tags": ["p", "h1", "h2", "h3"],
+        "exclude_tags": ["script", "style"],
+    }
+    return {
+        "message": "مرحباً بك في API استخراج نصوص صفحات الويب",
+        "endpoints": endpoint_summary,
+        "usage": {"example_request": sample_body},
+    }
+# response_model is a list of free-form dicts: each item carries 'attrs' as a
+# nested dict, which the previous Dict[str, str] model rejected at response
+# validation time (turning every non-empty result into a server error).
+@app.post("/extract", response_model=List[dict])
+async def extract_text(request: URLRequest):
+    """
+    استخراج النصوص من صفحة ويب
+    - **url**: رابط الصفحة المراد تحليلها
+    - **include_empty_text**: تضمين العلامات الفارغة (افتراضي: False)
+    - **specific_tags**: قائمة بعلامات محددة لاستخراج النصوص منها (افتراضي: جميع العلامات)
+    - **exclude_tags**: قائمة بعلامات لتجاهلها (مثل: script, style)
+    """
+    # The docstring above is kept verbatim: FastAPI renders it (as Markdown)
+    # in the generated API docs.
+    # Local import keeps this handler self-contained.
+    from fastapi.concurrency import run_in_threadpool
+
+    # fetch_webpage does blocking network I/O; run it in the worker pool so
+    # the event loop keeps serving other requests while the download runs.
+    html_content = await run_in_threadpool(fetch_webpage, str(request.url))
+    # Extract the text records. A missing or empty exclude_tags falls back
+    # to the default list of non-content tags.
+    extracted_texts = extract_text_from_tags(
+        html_content,
+        include_empty=request.include_empty_text,
+        specific_tags=request.specific_tags,
+        exclude_tags=request.exclude_tags or ["script", "style", "meta", "link", "noscript"]
+    )
+    return extracted_texts
+@app.get("/extract")
+async def extract_text_get(url: str, include_empty: bool = False):
+    """نسخة GET من استخراج النصوص (للسهولة)"""
+    # The docstring above is kept verbatim: FastAPI publishes it as this
+    # operation's description.
+    # Local import keeps this handler self-contained.
+    from fastapi.concurrency import run_in_threadpool
+
+    # `url` arrives as a plain query string, so validate it by hand here
+    if not is_valid_url(url):
+        raise HTTPException(status_code=400, detail="رابط غير صالح")
+    # Blocking download: run it in the worker pool, not on the event loop
+    html_content = await run_in_threadpool(fetch_webpage, url)
+    # Apply the same default exclusions as POST /extract so both routes
+    # return comparable results (script/style bodies are not page text).
+    return extract_text_from_tags(
+        html_content,
+        include_empty=include_empty,
+        exclude_tags=["script", "style", "meta", "link", "noscript"],
+    )
+@app.get("/health")
+async def health_check():
+    """فحص حالة الخدمة"""
+    # The docstring above is kept verbatim: FastAPI publishes it as this
+    # operation's description. The response is a fixed payload; no
+    # dependency is probed.
+    status_payload = {"status": "healthy", "service": "web-text-extractor"}
+    return status_payload
+@app.get("/tags-info")
+async def tags_info():
+    """معلومات عن العلامات الشائعة التي تحتوي على نص"""
+    # The docstring above is kept verbatim: FastAPI publishes it as this
+    # operation's description. Everything returned here is static reference data.
+    # NOTE(review): this advertised list also names img/br/hr, which the
+    # POST /extract fallback exclusion list does not contain — confirm
+    # which one is intended.
+    excluded_by_default = ["script", "style", "meta", "link", "noscript", "img", "br", "hr"]
+    return {
+        "info": "العلامات الشائعة التي تحتوي على نص في صفحات الويب",
+        "common_tags": {
+            "headings": ["h1", "h2", "h3", "h4", "h5", "h6"],
+            "paragraphs": ["p"],
+            "lists": ["li", "dt", "dd"],
+            "text_formatting": ["span", "strong", "em", "b", "i", "mark", "small"],
+            "links": ["a"],
+            "tables": ["td", "th", "caption"],
+            "quotes": ["blockquote", "q"],
+            "other": ["div", "section", "article", "header", "footer", "nav", "main", "aside"],
+        },
+        "excluded_by_default": excluded_by_default,
+    }
+if __name__ == "__main__":
+    # Script entry point: serve the app with uvicorn on all interfaces, port 8000.
+    # uvicorn is imported lazily so it is only needed when run this way.
+    from uvicorn import run as serve
+    serve(app, host="0.0.0.0", port=8000)