Mazenbs committed on
Commit
4481124
·
verified ·
1 Parent(s): e644b0b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -45
app.py CHANGED
@@ -1,67 +1,55 @@
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel, HttpUrl
3
- import requests
4
- from bs4 import BeautifulSoup
5
  from typing import List, Dict
6
- import re
 
7
 
# ASGI application object; the route decorators in this file register on it.
app = FastAPI(
    version="1.0.0",
    title="Text Extractor API",
)
 
 
 
 
9
 
class URLRequest(BaseModel):
    # Request body of POST /extract.
    # url: address of the page to read; typed as HttpUrl so pydantic
    # validates it before the handler runs.
    url: HttpUrl
 
12
 
class TextResponse(BaseModel):
    # One item of the list returned by POST /extract.
    # text: a single text fragment taken from the page.
    text: str
15
 
def _fetch_unique_texts(url: str) -> List[Dict[str, str]]:
    """Download *url* and return its distinct text fragments in page order.

    Each fragment is whitespace-normalised; fragments of one character or
    less are dropped. Raises ``requests.RequestException`` when the download
    fails or the server answers with an error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # NOTE(review): the URL is caller-supplied, so this request can be pointed
    # at internal hosts (SSRF). Consider an allow-list or address filtering.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove elements whose contents are not page text.
    for element in soup(['script', 'style', 'meta', 'link', 'noscript']):
        element.decompose()

    # `string=True` is the current spelling of the deprecated `text=True`.
    fragments = (
        re.sub(r'\s+', ' ', node.strip())
        for node in soup.find_all(string=True)
    )

    # After strip(), a fragment longer than one character is necessarily
    # non-empty and not all-whitespace, so a single length check suffices.
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique = dict.fromkeys(text for text in fragments if len(text) > 1)
    return [{'text': text} for text in unique]


@app.post("/extract", response_model=List[TextResponse])
async def extract_text_from_url(request: URLRequest):
    """Return the distinct text fragments found on the page at ``request.url``.

    Responds with HTTP 400 when the page cannot be fetched and HTTP 500 for
    any other failure while processing the content.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of the file.
    from fastapi.concurrency import run_in_threadpool

    try:
        # requests and BeautifulSoup are blocking; running them on a worker
        # thread keeps the event loop free to serve other requests.
        return await run_in_threadpool(_fetch_unique_texts, str(request.url))
    except requests.RequestException as e:
        raise HTTPException(status_code=400, detail=f"Error fetching URL: {str(e)}") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing content: {str(e)}") from e
 
 
 
60
 
@app.get("/")
async def root():
    """Return a short status message that points clients at the extract route."""
    # The extraction route is registered as POST /extract; the previous
    # message advertised a non-existent /extract-text path.
    return {"message": "Text Extractor API is running! Send POST request to /extract"}
 
 
 
 
 
 
 
 
64
 
if __name__ == "__main__":
    # Start the server only when this file is executed as a script.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
    )
 
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel, HttpUrl
 
 
3
  from typing import List, Dict
4
+ import requests
5
+ from helpers.text_blocks import extract_text_from_url
6
 
# ASGI application object; the route decorators in this file register on it.
# The description string is Arabic for "API for extracting text from web pages".
app = FastAPI(
    version="1.0.0",
    title="Text Extractor API",
    description="API لاستخراج النصوص من صفحات الويب",
)
12
 
class URLRequest(BaseModel):
    # Request body of POST /extract.
    # url: address of the page to read; typed as HttpUrl so pydantic
    # validates it before the handler runs.
    url: HttpUrl
    # timeout: passed as the second argument to extract_text_from_url;
    # presumably a number of seconds -- confirm against the helper.
    timeout: int = 10
16
 
class TextResponse(BaseModel):
    # One item of the list returned by POST /extract.
    # text: a single text fragment taken from the page.
    text: str
19
 
@app.post("/extract", response_model=List[TextResponse])
async def extract_text_endpoint(request: URLRequest):
    """Extract the text of the web page at ``request.url``.

    Delegates to ``extract_text_from_url`` with the URL (as a string) and the
    requested timeout, and returns its result unchanged. Responds with
    HTTP 400 when a ``requests.RequestException`` is raised and HTTP 500 for
    any other failure.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of the file.
    from fastapi.concurrency import run_in_threadpool

    try:
        # The helper is a plain synchronous call and presumably performs
        # blocking HTTP I/O (this handler catches requests.RequestException),
        # so run it on a worker thread instead of blocking the event loop.
        return await run_in_threadpool(
            extract_text_from_url, str(request.url), request.timeout
        )
    except requests.RequestException as e:
        raise HTTPException(
            status_code=400,
            detail=f"خطأ في جلب الصفحة: {str(e)}"
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"خطأ في معالجة المحتوى: {str(e)}"
        ) from e
40
 
@app.get("/")
async def root():
    """Return a status message plus the path and method of the extract route."""
    return {
        "message": "Text Extractor API is running!",
        # The extraction route is registered as POST /extract; the previous
        # value advertised a non-existent /extract-text path.
        "endpoint": "/extract",
        "method": "POST"
    }
48
+
@app.get("/health")
async def health_check():
    """Answer health probes with a fixed status payload."""
    payload = {"status": "healthy"}
    return payload
52
 
if __name__ == "__main__":
    # Start the development server only when this file is run as a script.
    import uvicorn

    # With reload=True uvicorn needs the application as an import string
    # ("module:attribute") so the reloader can re-import it; passing the app
    # object makes uvicorn warn and exit instead of serving. This file is
    # app.py, hence "app:app".
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)