pvanand committed on
Commit
83ff685
·
verified ·
1 Parent(s): d27546b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +52 -15
main.py CHANGED
@@ -1,14 +1,30 @@
1
  import os
2
  import asyncio
3
- from fastapi import FastAPI, HTTPException
 
4
  from pydantic import BaseModel, Field, create_model
5
  from typing import List, Optional
6
  from crawl4ai import AsyncWebCrawler
7
- from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
8
  import json
 
 
 
 
 
9
 
10
  app = FastAPI()
11
 
 
 
 
 
 
 
 
 
 
 
12
  class CrawlerInput(BaseModel):
13
  url: str = Field(..., description="URL to crawl")
14
  columns: List[str] = Field(..., description="List of required columns")
@@ -20,31 +36,26 @@ class CrawlerOutput(BaseModel):
20
  async def simple_crawl():
21
  async with AsyncWebCrawler(verbose=True) as crawler:
22
  result = await crawler.arun(url="https://www.nbcnews.com/business")
23
- print(len(result.markdown))
24
- return result
25
 
26
  @app.post("/crawl", response_model=CrawlerOutput)
27
- async def crawl(input: CrawlerInput):
28
  if len(input.columns) != len(input.descriptions):
29
  raise HTTPException(status_code=400, detail="Number of columns must match number of descriptions")
30
-
31
- # Create a dictionary with columns as keys and descriptions as values
32
  extraction_info = {col: desc for col, desc in zip(input.columns, input.descriptions)}
33
-
34
- # Create a dynamic Pydantic model based on the input columns and descriptions
35
  dynamic_model = create_model(
36
  'DynamicModel',
37
  **{col: (str, Field(..., description=desc)) for col, desc in extraction_info.items()}
38
  )
39
-
40
- # Convert the dictionary to a JSON string for the instruction
41
  instruction = f"Extract the following information: {json.dumps(extraction_info)}"
42
-
43
  async with AsyncWebCrawler(verbose=True) as crawler:
44
  result = await crawler.arun(
45
  url=input.url,
46
  extraction_strategy=LLMExtractionStrategy(
47
- provider="openai/gpt-4o-mini",
48
  api_token=os.getenv('OPENAI_API_KEY'),
49
  schema=dynamic_model.schema(),
50
  extraction_type="schema",
@@ -52,15 +63,41 @@ async def crawl(input: CrawlerInput):
52
  instruction=instruction
53
  )
54
  )
55
-
56
  extracted_data = json.loads(result.extracted_content)
57
  return CrawlerOutput(data=extracted_data)
58
 
59
  @app.get("/test")
60
- async def test():
61
  result = await simple_crawl()
62
  return {"markdown": result.markdown}
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  if __name__ == "__main__":
65
  import uvicorn
66
  uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
import os
import asyncio
import json
import logging
import secrets

from fastapi import FastAPI, HTTPException, Security, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import APIKeyHeader
from pydantic import BaseModel, Field, create_model
from typing import List, Optional

from crawl4ai import AsyncWebCrawler
# Fixed typo: the package is "crawl4ai", not "crawl4api" — the previous
# revision imported these strategies from crawl4ai.extraction_strategy.
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
11
+
12
+ # Set up logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
 
16
  app = FastAPI()
17
 
18
+ # API key configuration
19
+ API_KEY = os.getenv("API_KEY")
20
+ api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
21
+
22
async def verify_api_key(api_key: str = Security(api_key_header)):
    """Validate the X-API-Key request header against the configured key.

    Returns:
        The validated API key string.

    Raises:
        HTTPException(403): when the key is missing, wrong, or the server
            has no key configured.
    """
    # Fail closed when API_KEY is unset: with auto_error=False a missing
    # header yields None, and None != None is False — the original check
    # would grant access to every request when the env var was absent.
    # secrets.compare_digest gives a constant-time comparison.
    if not API_KEY or not api_key or not secrets.compare_digest(api_key, API_KEY):
        logger.warning("Invalid API key used")
        raise HTTPException(status_code=403, detail="Could not validate credentials")
    return api_key
27
+
28
  class CrawlerInput(BaseModel):
29
  url: str = Field(..., description="URL to crawl")
30
  columns: List[str] = Field(..., description="List of required columns")
 
36
async def simple_crawl():
    """Crawl a fixed news page and return the raw crawl result.

    Serves as a smoke test for the crawler setup (used by the /test route).
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        # Use the module logger rather than print(), consistent with the
        # logging configuration at the top of the file.
        logger.info("Fetched %d characters of markdown", len(result.markdown))
        return result
41
 
42
@app.post("/crawl", response_model=CrawlerOutput)
async def crawl(input: CrawlerInput, api_key: str = Depends(verify_api_key)):
    """Crawl `input.url` and extract the requested columns via an LLM.

    Each entry in `input.columns` is paired with the description at the
    same index in `input.descriptions`; a dynamic pydantic model built
    from those pairs validates the extraction schema.

    Raises:
        HTTPException(400): when columns and descriptions differ in length.
    """
    if len(input.columns) != len(input.descriptions):
        raise HTTPException(status_code=400, detail="Number of columns must match number of descriptions")

    # Map each requested column name to its natural-language description.
    extraction_info = {col: desc for col, desc in zip(input.columns, input.descriptions)}

    # Build a pydantic model on the fly so the LLM output is schema-checked.
    dynamic_model = create_model(
        'DynamicModel',
        **{col: (str, Field(..., description=desc)) for col, desc in extraction_info.items()}
    )

    instruction = f"Extract the following information: {json.dumps(extraction_info)}"

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=input.url,
            extraction_strategy=LLMExtractionStrategy(
                # "gpt-4-mini" is not a real OpenAI model id; the working
                # revision used "gpt-4o-mini" — restored here.
                provider="openai/gpt-4o-mini",
                api_token=os.getenv('OPENAI_API_KEY'),
                schema=dynamic_model.schema(),
                extraction_type="schema",
                # NOTE(review): the diff view omits one LLMExtractionStrategy
                # keyword argument at this position — confirm against the
                # full file before merging.
                instruction=instruction
            )
        )

    extracted_data = json.loads(result.extracted_content)
    return CrawlerOutput(data=extracted_data)
69
 
70
@app.get("/test")
async def test(api_key: str = Depends(verify_api_key)):
    """Smoke-test endpoint: crawl a fixed page and return its markdown."""
    result = await simple_crawl()
    return {"markdown": result.markdown}
74
 
75
# CORS middleware setup. CORSMiddleware is imported with the other
# dependencies at the top of the file.
# Browser Origin headers never carry a trailing slash, so entries such as
# "http://127.0.0.1:5501/" could never match — the slashes are removed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://127.0.0.1:5501",
        "http://localhost:5501",
        "http://localhost:3000",
        "https://www.elevaticsai.com",
        "https://www.elevatics.cloud",
        "https://www.elevatics.online",
        "https://www.elevatics.ai",
        "https://elevaticsai.com",
        "https://elevatics.cloud",
        "https://elevatics.online",
        "https://elevatics.ai",
        "https://pvanand-specialized-agents.hf.space",
        "https://pvanand-audio-chat.hf.space",
    ],
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)
100
+
101
# Entry point: run the app with uvicorn when executed directly,
# binding all interfaces on port 8000.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)