tregu0458 committed on
Commit
c8bcd4e
·
verified ·
1 Parent(s): 6fabfc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -1
app.py CHANGED
@@ -59,7 +59,7 @@ def extract_text(url: str, language: str = "ja", length: int = 150000,use_jina:b
59
  return {"message": error_msg}
60
 
61
  @app.post("/httpx_bs", tags=["Text Extraction and beautiful soup"], dependencies=[Depends(validate_token)])
62
- def extract_text(url: str, length: int = 150000):
63
  try:
64
  response = httpx.get(url)
65
  text_content = str(convert_to_markdown(response,url))
@@ -75,6 +75,37 @@ def extract_text(url: str, length: int = 150000):
75
  error_msg = str(e)
76
  return {"message": error_msg}
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def convert_to_markdown(response_text,url):
79
  # if response.status_code != 200:
80
  # return f"エラー: ステータスコード {response.status_code}"
 
59
  return {"message": error_msg}
60
 
61
  @app.post("/httpx_bs", tags=["Text Extraction and beautiful soup"], dependencies=[Depends(validate_token)])
62
+ def httpx_bs(url: str, length: int = 150000):
63
  try:
64
  response = httpx.get(url)
65
  text_content = str(convert_to_markdown(response,url))
 
75
  error_msg = str(e)
76
  return {"message": error_msg}
77
 
78
@app.post("/extract_from_url", tags=["Text Extraction from URL"], dependencies=[Depends(validate_token)])
def extract_from_url(url: str, length: int = 150000, tool: str = "httpx"):
    """Fetch ``url`` with the selected tool and return its text, capped at ``length``.

    Args:
        url: Address of the page to fetch.
        length: Character budget for the returned text. Text shorter than
            this is returned whole; otherwise the first and last
            ``int(length / 2)`` characters are concatenated and the middle
            is dropped.
        tool: Fetching backend, one of ``"jina"``, ``"httpx"``,
            ``"requests"`` or ``"webbaseloader"``.

    Returns:
        ``{"text_content": <str>}`` on success, or ``{"message": <str>}``
        carrying the error text when anything fails (including an unknown
        ``tool``). Exceptions are never propagated to the caller.
    """
    try:
        if tool == "jina":
            # requests has no default timeout, so an unresponsive reader
            # service would block this handler forever. Use the same 10 s
            # limit as the plain "requests" branch below.
            response = requests.get("https://r.jina.ai/" + url, timeout=10)
            text_content = response.text
        elif tool == "httpx":
            response = httpx.get(url)
            text_content = str(convert_to_markdown(response.text, url))
        elif tool == "requests":
            response = requests.get(url, timeout=10)
            text_content = str(convert_to_markdown(response.text, url))
        elif tool == "webbaseloader":
            loader = WebBaseLoader(url)
            docs = loader.load()
            if not docs:
                # Report an explicit reason instead of the opaque
                # "list index out of range" that docs[0] would produce.
                raise ValueError("WebBaseLoader returned no documents for the given URL.")
            text_content = docs[0].page_content
        else:
            raise ValueError("Invalid tool specified. Choose from 'jina', 'httpx', 'requests', or 'webbaseloader'.")

        if len(text_content) < length:
            return {"text_content": text_content}

        # Too long: keep the head and the tail, drop the middle.
        half = int(length / 2)
        head = text_content[:half]
        tail = text_content[len(text_content) - half:]
        return {"text_content": head + tail}
    except Exception as e:
        # Endpoint boundary: every failure is reported in the response body.
        return {"message": str(e)}
107
+
108
+
109
  def convert_to_markdown(response_text,url):
110
  # if response.status_code != 200:
111
  # return f"エラー: ステータスコード {response.status_code}"