tregu0458 committed on
Commit
6b02d5a
·
verified ·
1 Parent(s): 30ffb59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -5
app.py CHANGED
@@ -4,7 +4,7 @@ from fastapi import FastAPI, HTTPException, Depends
4
  from fastapi.security import OAuth2PasswordBearer
5
  from langchain_community.document_loaders import YoutubeLoader, UnstructuredPDFLoader, WebBaseLoader
6
  from langchain_community.document_loaders import OnlinePDFLoader
7
-
8
  app = FastAPI()
9
 
10
  API_KEY = os.environ["API_KEY"]
@@ -34,9 +34,10 @@ def extract_text(url: str, language: str = "ja", length: int = 150000):
34
  text_content = docs[0].page_content
35
  else:
36
  # それ以外の場合
37
- loader = WebBaseLoader(url)
38
- docs = loader.load()
39
- text_content = docs[0].page_content
 
40
 
41
  if len(text_content) < length:
42
  return {"text_content": text_content}
@@ -47,4 +48,37 @@ def extract_text(url: str, language: str = "ja", length: int = 150000):
47
  }
48
  except Exception as e:
49
  error_msg = str(e)
50
- return {"message": error_msg}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from fastapi.security import OAuth2PasswordBearer
5
  from langchain_community.document_loaders import YoutubeLoader, UnstructuredPDFLoader, WebBaseLoader
6
  from langchain_community.document_loaders import OnlinePDFLoader
7
+ from bs4 import BeautifulSoup
8
  app = FastAPI()
9
 
10
  API_KEY = os.environ["API_KEY"]
 
34
  text_content = docs[0].page_content
35
  else:
36
  # それ以外の場合
37
+ # loader = WebBaseLoader(url)
38
+ # docs = loader.load()
39
+ # text_content = docs[0].page_content
40
+ text_content = str(fetch_and_convert_to_markdown(url))
41
 
42
  if len(text_content) < length:
43
  return {"text_content": text_content}
 
48
  }
49
  except Exception as e:
50
  error_msg = str(e)
51
+ return {"message": error_msg}
52
+
53
def fetch_and_convert_to_markdown(url):
    """Fetch *url* over HTTP and convert selected HTML elements to Markdown.

    Headings (h1-h6), paragraphs, links, and list items found inside
    ``<body>`` are emitted in document order; all other markup is dropped.

    Args:
        url: Absolute URL of the page to fetch. Also used as the base for
            resolving relative link hrefs.

    Returns:
        A Markdown string, or an error message string (in Japanese) when
        the HTTP status code is not 200.

    NOTE(review): assumes ``requests``, ``BeautifulSoup`` and ``urljoin``
    are imported at module level — confirm against the file's import block,
    which is not fully visible in this diff.
    """
    response = requests.get(url)
    if response.status_code != 200:
        # Runtime message kept verbatim (Japanese: "Error: status code ...").
        return f"エラー: ステータスコード {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    markdown = ""

    # Title: guard against a missing or empty <title> — soup.title.string is
    # None for an empty title or one containing nested tags, and calling
    # .strip() on it raised AttributeError in the original code.
    if soup.title and soup.title.string:
        markdown += f"# {soup.title.string.strip()}\n\n"

    # Main content: only elements inside the <body> tag are considered.
    main_content = soup.body
    if main_content:
        for element in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'ul', 'ol']):
            if element.name.startswith('h'):
                # Map 'h3' -> '###', etc.
                level = int(element.name[1])
                markdown += f"{'#' * level} {element.get_text().strip()}\n\n"
            elif element.name == 'p':
                markdown += f"{element.get_text().strip()}\n\n"
            elif element.name == 'a':
                href = element.get('href')
                if href:
                    # Resolve relative hrefs against the page URL.
                    full_url = urljoin(url, href)
                    markdown += f"[{element.get_text().strip()}]({full_url})\n\n"
            elif element.name in ['ul', 'ol']:
                for li in element.find_all('li'):
                    markdown += f"- {li.get_text().strip()}\n"
                markdown += "\n"

    return markdown