Spaces:

Mazenbs
/

extract_html_full

Running

App Files Files Community

Mazenbs committed on Dec 3, 2025

Commit

4481124

verified ·

1 Parent(s): e644b0b

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -45

app.py CHANGED Viewed

@@ -1,67 +1,55 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, HttpUrl
-import requests
-from bs4 import BeautifulSoup
 from typing import List, Dict
-import re
-app = FastAPI(title="Text Extractor API", version="1.0.0")
 class URLRequest(BaseModel):
     url: HttpUrl
 class TextResponse(BaseModel):
     text: str
 @app.post("/extract", response_model=List[TextResponse])
-async def extract_text_from_url(request: URLRequest):
     try:
-        # إرسال طلب GET للرابط
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-        }
-        response = requests.get(str(request.url), headers=headers, timeout=10)
-        response.raise_for_status()
-        # تحليل HTML
-        soup = BeautifulSoup(response.content, 'html.parser')
-        # إزالة العناصر غير المرغوب فيها
-        for element in soup(['script', 'style', 'meta', 'link', 'noscript']):
-            element.decompose()
-        # استخراج جميع النصوص من كل العناصر
-        text_elements = []
-        # البحث عن جميع العناصر التي تحتوي على نص
-        for element in soup.find_all(text=True):
-            text = element.strip()
-            # تنظيف النص وإزالة المسافات الزائدة
-            text = re.sub(r'\s+', ' ', text)
-            # تجاهل النصوص الفارغة والنصوص التي تحتوي على محارف خاصة فقط
-            if text and len(text) > 1 and not text.isspace():
-                text_elements.append({'text': text})
-        # إزالة النصوص المكررة
-        seen = set()
-        unique_texts = []
-        for item in text_elements:
-            if item['text'] not in seen:
-                seen.add(item['text'])
-                unique_texts.append(item)
-        return unique_texts
     except requests.RequestException as e:
-        raise HTTPException(status_code=400, detail=f"Error fetching URL: {str(e)}")
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error processing content: {str(e)}")
 @app.get("/")
 async def root():
-    return {"message": "Text Extractor API is running! Send POST request to /extract-text"}
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, HttpUrl
 from typing import List, Dict
+import requests
+from helpers.text_blocks import extract_text_from_url
+# Application object that the route decorators below register against;
+# title, description and version are passed to FastAPI as API metadata.
+app = FastAPI(
+    title="Text Extractor API",
+    description="API لاستخراج النصوص من صفحات الويب",
+    version="1.0.0"
+)
 class URLRequest(BaseModel):
+    """Request body accepted by the POST /extract endpoint."""
     url: HttpUrl
+    # Forwarded unchanged as the second argument of extract_text_from_url.
+    # NOTE(review): presumably a fetch timeout in seconds — confirm in helpers.text_blocks.
+    timeout: int = 10
 class TextResponse(BaseModel):
+    """One item of the list declared as the response model of POST /extract."""
     text: str
 @app.post("/extract", response_model=List[TextResponse])
+async def extract_text_endpoint(request: URLRequest):
+    """Extract the text of the web page at ``request.url``.
+
+    Returns whatever ``extract_text_from_url`` produces; the route declares
+    the response as ``List[TextResponse]``.
+
+    Raises:
+        HTTPException: 400 when the helper raises ``requests.RequestException``,
+            500 for any other exception.
+    """
+    # Local import keeps this block self-contained; fastapi is already a
+    # dependency of this module.
+    from fastapi.concurrency import run_in_threadpool
+
     try:
+        # The helper is an ordinary synchronous call (it is never awaited), so
+        # it runs on a worker thread rather than stalling the event loop for
+        # every other request while the page is fetched.
+        result = await run_in_threadpool(
+            extract_text_from_url, str(request.url), request.timeout
+        )
+        return result
     except requests.RequestException as e:
+        # Chain the cause so the original traceback is kept in the logs.
+        raise HTTPException(
+            status_code=400,
+            detail=f"خطأ في جلب الصفحة: {str(e)}"
+        ) from e
     except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"خطأ في معالجة المحتوى: {str(e)}"
+        ) from e
 @app.get("/")
 async def root():
+    """Return a short status payload telling clients how to call the API."""
+    return {
+        "message": "Text Extractor API is running!",
+        # Must match the path registered with @app.post; no "/extract-text"
+        # route exists, so advertising it sent clients to a 404.
+        "endpoint": "/extract",
+        "method": "POST"
+    }
+@app.get("/health")
+async def health_check():
+    """Health probe: always answers with a fixed "healthy" status."""
+    payload = {"status": "healthy"}
+    return payload
 if __name__ == "__main__":
     import uvicorn
+    # uvicorn honours reload=True only when given an import string; handed the
+    # app object it logs a warning and exits without serving.
+    # "app:app" means module app.py, attribute `app`.
+    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)