Spaces:

tregu0458
/

URL2Text

Sleeping

tregu0458 committed on May 12, 2024

Commit

7a0c4e5

verified ·

1 Parent(s): 451bc87

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+from fastapi import FastAPI, HTTPException
+from langchain.document_loaders import YoutubeLoader, UnstructuredPDFLoader, WebBaseLoader
+app = FastAPI()  # application object that the @app.post route on extract_text registers on
+@app.post("/extract_text", tags=["Text Extraction"])
+def extract_text(url: str, language: str = "ja", length: int = 150000):
+    try:
+        if "youtube.com" in url or "youtu.be" in url:
+            # YouTubeの場合
+            loader = YoutubeLoader.from_youtube_url(
+                youtube_url=url,
+                add_video_info=True,
+                language=[language],
+            )
+            docs = loader.load()
+            text_content = str(docs)
+        elif url.endswith(".pdf"):
+            # PDFの場合
+            loader = UnstructuredPDFLoader(url)
+            docs = loader.load()
+            text_content = docs[0].page_content
+        else:
+            # それ以外の場合
+            loader = WebBaseLoader(url)
+            docs = loader.load()
+            text_content = docs[0].page_content
+        if len(text_content) < length:
+            return {"text_content": text_content}
+        else:
+            return {
+                "text_content": text_content[: int(length / 2)]
+                + text_content[len(text_content) - int(length / 2) :]
+            }
+    except Exception as e:
+        error_msg = str(e)
+        return {"message": error_msg}