tregu0458 committed on
Commit
6b02d5a
·
verified ·
1 Parent(s): 30ffb59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -5
app.py CHANGED
@@ -4,7 +4,7 @@ from fastapi import FastAPI, HTTPException, Depends
4
  from fastapi.security import OAuth2PasswordBearer
5
  from langchain_community.document_loaders import YoutubeLoader, UnstructuredPDFLoader, WebBaseLoader
6
  from langchain_community.document_loaders import OnlinePDFLoader
7
-
8
  app = FastAPI()
9
 
10
  API_KEY = os.environ["API_KEY"]
@@ -34,9 +34,10 @@ def extract_text(url: str, language: str = "ja", length: int = 150000):
34
  text_content = docs[0].page_content
35
  else:
36
  # それ以外の場合
37
- loader = WebBaseLoader(url)
38
- docs = loader.load()
39
- text_content = docs[0].page_content
 
40
 
41
  if len(text_content) < length:
42
  return {"text_content": text_content}
@@ -47,4 +48,37 @@ def extract_text(url: str, language: str = "ja", length: int = 150000):
47
  }
48
  except Exception as e:
49
  error_msg = str(e)
50
- return {"message": error_msg}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from fastapi.security import OAuth2PasswordBearer
5
  from langchain_community.document_loaders import YoutubeLoader, UnstructuredPDFLoader, WebBaseLoader
6
  from langchain_community.document_loaders import OnlinePDFLoader
7
+ from bs4 import BeautifulSoup
8
  app = FastAPI()
9
 
10
  API_KEY = os.environ["API_KEY"]
 
34
  text_content = docs[0].page_content
35
  else:
36
  # それ以外の場合
37
+ # loader = WebBaseLoader(url)
38
+ # docs = loader.load()
39
+ # text_content = docs[0].page_content
40
+ text_content = str(fetch_and_convert_to_markdown(url))
41
 
42
  if len(text_content) < length:
43
  return {"text_content": text_content}
 
48
  }
49
  except Exception as e:
50
  error_msg = str(e)
51
+ return {"message": error_msg}
52
+
53
def fetch_and_convert_to_markdown(url):
    """Fetch *url* over HTTP and convert selected HTML elements to Markdown.

    Headings (h1-h6), paragraphs, links, and list items found inside
    ``<body>`` are emitted in document order; all other markup is dropped.

    Args:
        url: Absolute URL of the page to fetch. Also used as the base for
            resolving relative link hrefs.

    Returns:
        A Markdown string, or an error message string (in Japanese) when
        the HTTP status code is not 200.

    NOTE(review): assumes ``requests``, ``BeautifulSoup`` and ``urljoin``
    are imported at module level — confirm against the file's import block,
    which is not fully visible in this diff.
    """
    response = requests.get(url)
    if response.status_code != 200:
        # Runtime message kept verbatim (Japanese: "Error: status code ...").
        return f"エラー: ステータスコード {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    markdown = ""

    # Title: guard against a missing or empty <title> — soup.title.string is
    # None for an empty title or one containing nested tags, and calling
    # .strip() on it raised AttributeError in the original code.
    if soup.title and soup.title.string:
        markdown += f"# {soup.title.string.strip()}\n\n"

    # Main content: only elements inside the <body> tag are considered.
    main_content = soup.body
    if main_content:
        for element in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'ul', 'ol']):
            if element.name.startswith('h'):
                # Map 'h3' -> '###', etc.
                level = int(element.name[1])
                markdown += f"{'#' * level} {element.get_text().strip()}\n\n"
            elif element.name == 'p':
                markdown += f"{element.get_text().strip()}\n\n"
            elif element.name == 'a':
                href = element.get('href')
                if href:
                    # Resolve relative hrefs against the page URL.
                    full_url = urljoin(url, href)
                    markdown += f"[{element.get_text().strip()}]({full_url})\n\n"
            elif element.name in ['ul', 'ol']:
                for li in element.find_all('li'):
                    markdown += f"- {li.get_text().strip()}\n"
                markdown += "\n"

    return markdown