Spaces:
Sleeping
Sleeping
UPDATE: web crawler
Browse files- app.py +4 -8
- functions.py +34 -1
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -1,11 +1,9 @@
|
|
| 1 |
import io
|
| 2 |
-
import re
|
| 3 |
from functions import *
|
| 4 |
from PyPDF2 import PdfReader
|
| 5 |
-
from bs4 import BeautifulSoup
|
| 6 |
from fastapi import FastAPI, File, UploadFile
|
| 7 |
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
-
from langchain_community.document_loaders import RecursiveUrlLoader
|
| 9 |
|
| 10 |
|
| 11 |
app = FastAPI(title = "ConversAI", root_path = "/api/v1")
|
|
@@ -52,12 +50,10 @@ async def addText(vectorstore: str, text: str):
|
|
| 52 |
|
| 53 |
@app.post("/addWebsite")
|
| 54 |
async def addWebsite(vectorstore: str, websiteUrl: str):
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
return re.sub(r"\n\n+", "\n\n", soup.text).strip()
|
| 58 |
-
loader = RecursiveUrlLoader(websiteUrl, max_depth=2, timeout = 60, extractor=bs4_extractor)
|
| 59 |
docs = loader.load()
|
| 60 |
-
text = "\n\n".join([docs[doc].page_content for doc in range(len(docs))])
|
| 61 |
return addDocuments(text = text, vectorstore = vectorstore)
|
| 62 |
|
| 63 |
|
|
|
|
| 1 |
import io
|
|
|
|
| 2 |
from functions import *
|
| 3 |
from PyPDF2 import PdfReader
|
|
|
|
| 4 |
from fastapi import FastAPI, File, UploadFile
|
| 5 |
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
+
from langchain_community.document_loaders import UnstructuredURLLoader
|
| 7 |
|
| 8 |
|
| 9 |
app = FastAPI(title = "ConversAI", root_path = "/api/v1")
|
|
|
|
| 50 |
|
| 51 |
@app.post("/addWebsite")
async def addWebsite(vectorstore: str, websiteUrl: str):
    """Crawl a website and ingest its pages into the given vectorstore.

    Args:
        vectorstore: Name of the target vectorstore collection.
        websiteUrl: Root URL whose same-domain pages should be crawled.

    Returns:
        Whatever addDocuments (functions.py) returns for the ingested text.
    """
    # BUG FIX: crawl the URL the caller submitted, not a hard-coded
    # YouTube link that ignored the websiteUrl parameter entirely.
    urls = getLinks(websiteUrl)
    loader = UnstructuredURLLoader(urls = urls)
    docs = loader.load()
    # Join each page's metadata + content; iterate docs directly instead
    # of indexing via range(len(docs)).
    text = "\n\n\n\n".join(
        f"Metadata:\n{doc.metadata} \nPage Content:\n {doc.page_content}"
        for doc in docs
    )
    return addDocuments(text = text, vectorstore = vectorstore)
|
| 58 |
|
| 59 |
|
functions.py
CHANGED
|
@@ -18,9 +18,14 @@ from langchain.retrievers.document_compressors import FlashrankRerank
|
|
| 18 |
from supabase.client import create_client
|
| 19 |
from qdrant_client import QdrantClient
|
| 20 |
from langchain_groq import ChatGroq
|
|
|
|
|
|
|
| 21 |
from supabase import create_client
|
| 22 |
from dotenv import load_dotenv
|
| 23 |
import os
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
load_dotenv("secrets.env")
|
| 26 |
client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
|
|
@@ -243,4 +248,32 @@ def listTables(username: str):
|
|
| 243 |
except Exception as e:
|
| 244 |
return {
|
| 245 |
"error": e
|
| 246 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
from supabase.client import create_client
|
| 19 |
from qdrant_client import QdrantClient
|
| 20 |
from langchain_groq import ChatGroq
|
| 21 |
+
from bs4 import BeautifulSoup
|
| 22 |
+
from urllib.parse import urlparse
|
| 23 |
from supabase import create_client
|
| 24 |
from dotenv import load_dotenv
|
| 25 |
import os
|
| 26 |
+
import time
|
| 27 |
+
import requests
|
| 28 |
+
|
| 29 |
|
| 30 |
load_dotenv("secrets.env")
|
| 31 |
client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
|
|
|
|
| 248 |
except Exception as e:
|
| 249 |
return {
|
| 250 |
"error": e
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def getLinks(url: str, timeout = 30):
    """Crawl ``url`` one level deep and collect same-domain links.

    Fetches the seed page, gathers every anchor whose netloc matches the
    page's netloc, then fetches each of those pages in turn (stopping once
    ``timeout`` seconds have elapsed) and unions their same-domain anchors.

    Note: relative hrefs (empty netloc) are excluded, matching the
    original filter — presumably intentional, but worth confirming.

    Args:
        url: Seed page to start crawling from.
        timeout: Soft wall-clock budget (seconds) for the crawl loop.

    Returns:
        De-duplicated list of links with a single trailing "/" stripped.
    """
    start = time.time()

    def getLinksFromPage(pageUrl: str):
        # Per-request timeout so one dead host cannot hang the whole
        # crawl — previously requests.get had no timeout at all, which
        # silently defeated the function's own time budget.
        response = requests.get(pageUrl, timeout = 10)
        soup = BeautifulSoup(response.content, "lxml")
        baseNetloc = urlparse(pageUrl).netloc
        pageLinks = []
        for tag in soup.find_all("a"):
            href = tag.attrs.get("href")
            # Keep only absolute links that stay on the same domain.
            if href and urlparse(href).netloc == baseNetloc:
                pageLinks.append(href)
        return pageLinks

    # De-duplicate seed links up front so each page is fetched at most
    # once (the original crawled duplicate seed links repeatedly).
    links = set(getLinksFromPage(url))
    uniqueLinks = set()
    for link in links:
        if time.time() - start > timeout:
            break  # crawl budget exhausted; return what we have so far
        uniqueLinks |= set(getLinksFromPage(link))
    # Strip a single trailing slash; endswith is safe on empty strings,
    # unlike the original x[-1] which raised IndexError on "".
    return list({x[:-1] if x.endswith("/") else x for x in uniqueLinks})
|
requirements.txt
CHANGED
|
@@ -14,4 +14,6 @@ lxml
|
|
| 14 |
PyPDF2
|
| 15 |
python-dotenv
|
| 16 |
sentence-transformers
|
| 17 |
-
supabase
|
|
|
|
|
|
|
|
|
| 14 |
PyPDF2
|
| 15 |
python-dotenv
|
| 16 |
sentence-transformers
|
| 17 |
+
supabase
|
| 18 |
+
unstructured
|
| 19 |
+
urllib3
|