Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,67 +1,55 @@
|
|
| 1 |
from fastapi import FastAPI, HTTPException
|
| 2 |
from pydantic import BaseModel, HttpUrl
|
| 3 |
-
import requests
|
| 4 |
-
from bs4 import BeautifulSoup
|
| 5 |
from typing import List, Dict
|
| 6 |
-
import
|
|
|
|
| 7 |
|
| 8 |
-
app = FastAPI(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
class URLRequest(BaseModel):
|
| 11 |
url: HttpUrl
|
|
|
|
| 12 |
|
| 13 |
class TextResponse(BaseModel):
|
| 14 |
text: str
|
| 15 |
|
| 16 |
@app.post("/extract", response_model=List[TextResponse])
|
| 17 |
-
async def
|
|
|
|
|
|
|
|
|
|
| 18 |
try:
|
| 19 |
-
#
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
}
|
| 23 |
-
response = requests.get(str(request.url), headers=headers, timeout=10)
|
| 24 |
-
response.raise_for_status()
|
| 25 |
-
|
| 26 |
-
# تحليل HTML
|
| 27 |
-
soup = BeautifulSoup(response.content, 'html.parser')
|
| 28 |
-
|
| 29 |
-
# إزالة العناصر غير المرغوب فيها
|
| 30 |
-
for element in soup(['script', 'style', 'meta', 'link', 'noscript']):
|
| 31 |
-
element.decompose()
|
| 32 |
-
|
| 33 |
-
# استخراج جميع النصوص من كل العناصر
|
| 34 |
-
text_elements = []
|
| 35 |
-
|
| 36 |
-
# البحث عن جميع العناصر التي تحتوي على نص
|
| 37 |
-
for element in soup.find_all(text=True):
|
| 38 |
-
text = element.strip()
|
| 39 |
-
# تنظيف النص وإزالة المسافات الزائدة
|
| 40 |
-
text = re.sub(r'\s+', ' ', text)
|
| 41 |
-
|
| 42 |
-
# تجاهل النصوص الفارغة والنصوص التي تحتوي على محارف خاصة فقط
|
| 43 |
-
if text and len(text) > 1 and not text.isspace():
|
| 44 |
-
text_elements.append({'text': text})
|
| 45 |
-
|
| 46 |
-
# إزالة النصوص المكررة
|
| 47 |
-
seen = set()
|
| 48 |
-
unique_texts = []
|
| 49 |
-
for item in text_elements:
|
| 50 |
-
if item['text'] not in seen:
|
| 51 |
-
seen.add(item['text'])
|
| 52 |
-
unique_texts.append(item)
|
| 53 |
-
|
| 54 |
-
return unique_texts
|
| 55 |
|
| 56 |
except requests.RequestException as e:
|
| 57 |
-
raise HTTPException(
|
|
|
|
|
|
|
|
|
|
| 58 |
except Exception as e:
|
| 59 |
-
raise HTTPException(
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
@app.get("/")
|
| 62 |
async def root():
|
| 63 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
| 66 |
import uvicorn
|
| 67 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 1 |
from fastapi import FastAPI, HTTPException
|
| 2 |
from pydantic import BaseModel, HttpUrl
|
|
|
|
|
|
|
| 3 |
from typing import List, Dict
|
| 4 |
+
import requests
|
| 5 |
+
from helpers.text_blocks import extract_text_from_url
|
| 6 |
|
| 7 |
+
# Application instance; title, description and version are handed to FastAPI
# as the API's metadata.
app = FastAPI(
    title="Text Extractor API",
    description="API لاستخراج النصوص من صفحات الويب",
    version="1.0.0",
)
|
| 12 |
|
| 13 |
class URLRequest(BaseModel):
    """Request body accepted by the text-extraction endpoint."""

    # Address of the page to process; pydantic validates it as an HTTP(S) URL.
    url: HttpUrl
    # Forwarded as the second argument to extract_text_from_url; defaults to 10.
    # NOTE(review): presumably seconds - confirm against the helper.
    timeout: int = 10
|
| 16 |
|
| 17 |
class TextResponse(BaseModel):
    """One element of the list returned by the `/extract` endpoint."""

    # A single piece of extracted text.
    text: str
|
| 19 |
|
| 20 |
@app.post("/extract", response_model=List[TextResponse])
async def extract_text_endpoint(request: URLRequest):
    """Extract the texts found on the web page at the requested URL.

    Args:
        request: Validated request body carrying the page ``url`` and the
            ``timeout`` that is forwarded to ``extract_text_from_url``.

    Returns:
        The value returned by ``extract_text_from_url``; FastAPI validates
        and serialises it as ``List[TextResponse]``.

    Raises:
        HTTPException: 400 when the helper raises
            ``requests.RequestException``, 500 for any other failure.
            An ``HTTPException`` raised by the helper is passed through
            unchanged.
    """
    # Function-scope import so this handler stays self-contained; the
    # helper ships with FastAPI itself.
    from fastapi.concurrency import run_in_threadpool

    try:
        # The helper is invoked as a plain (non-awaited) call and surfaces
        # ``requests`` errors, so it presumably performs blocking HTTP I/O.
        # Running it in the worker threadpool keeps the event loop free to
        # serve other requests while the page is being fetched.
        return await run_in_threadpool(
            extract_text_from_url, str(request.url), request.timeout
        )
    except HTTPException:
        # Already a well-formed HTTP error: do not let the catch-all below
        # rewrite it into a 500.
        raise
    except requests.RequestException as e:
        raise HTTPException(
            status_code=400,
            detail=f"خطأ في جلب الصفحة: {str(e)}"
        ) from e
    except Exception as e:
        # Top-level boundary: report every other failure as a server error,
        # keeping the original exception chained for tracebacks.
        raise HTTPException(
            status_code=500,
            detail=f"خطأ في معالجة المحتوى: {str(e)}"
        ) from e
|
| 40 |
|
| 41 |
@app.get("/")
async def root():
    """Return a short status payload that points clients at the extraction route."""
    return {
        "message": "Text Extractor API is running!",
        # Must match the path of the POST route registered in this file
        # ("/extract"); advertising any other path sends clients to a 404.
        "endpoint": "/extract",
        "method": "POST",
    }
|
| 48 |
+
|
| 49 |
+
@app.get("/health")
async def health_check():
    """Health probe: unconditionally reports a healthy status."""
    status_payload = {"status": "healthy"}
    return status_payload
|
| 52 |
|
| 53 |
if __name__ == "__main__":
    import uvicorn

    # uvicorn only supports reload=True when the application is given as an
    # import string; with the app object it logs a warning and exits without
    # serving. Assumes this file is importable as `app` (i.e. app.py).
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
|