chat-langchain

Paused

App Files Files Community

chat-langchain / ingest.py

christopherbfrance

Duplicate from hwchase17/chat-langchain

f700bae about 3 years ago

raw

history blame contribute delete

2.55 kB

	"""Load html from files, clean up, split, ingest into Weaviate."""
	import os
	from pathlib import Path

	import weaviate
	from bs4 import BeautifulSoup
	from langchain.text_splitter import CharacterTextSplitter


	def clean_data(data):
	    """Extract the readable text of one documentation page.

	    Parses ``data`` as HTML, takes the text of the first
	    ``<main id="main-content">`` element and drops empty lines.

	    Args:
	        data: HTML markup of one page, as read from the file.

	    Returns:
	        The text of the main content element with empty lines removed and
	        the remaining lines joined by newlines. An empty string is returned
	        when the markup has no ``<main id="main-content">`` element
	        (including empty input).
	    """
	    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
	    # parser happens to be installed, so results can differ between machines.
	    soup = BeautifulSoup(data, "html.parser")
	    main = soup.find("main", {"id": "main-content"})
	    if main is None:
	        # No main content element: nothing to extract, so do not index
	        # into an empty result list.
	        return ""
	    text = main.get_text()
	    return "\n".join(line for line in text.split("\n") if line)


	# Collect the cleaned text of every file below the mirrored docs tree.
	# docs[i] and metadatas[i] always describe the same file.
	docs = []
	metadatas = []
	for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
	    if p.is_dir():
	        continue
	    # Decode explicitly as UTF-8 instead of relying on the platform's
	    # default encoding (the mirrored pages are presumably UTF-8 -- confirm).
	    with open(p, encoding="utf-8") as f:
	        docs.append(clean_data(f.read()))
	        metadatas.append({"source": p})


	# Splitter configuration: break on newlines, chunk size 1000 with an
	# overlap of 200, lengths measured with len().
	text_splitter = CharacterTextSplitter(
	    chunk_size=1000, chunk_overlap=200, separator="\n", length_function=len
	)

	# Split the collected texts, passing the per-file metadata along.
	documents = text_splitter.create_documents(docs, metadatas=metadatas)


	# Connection settings are read from the environment; a missing variable
	# raises KeyError right here.
	WEAVIATE_URL = os.environ["WEAVIATE_URL"]
	_openai_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}
	client = weaviate.Client(url=WEAVIATE_URL, additional_headers=_openai_headers)

	def _text_property(name, description, skip):
	    """Build one "text" property entry for the "Paragraph" class.

	    ``skip`` is stored as the ``skip`` flag of the property's
	    ``text2vec-openai`` module config.
	    """
	    return {
	        "dataType": ["text"],
	        "description": description,
	        "moduleConfig": {
	            "text2vec-openai": {
	                "skip": skip,
	                "vectorizePropertyName": False,
	            }
	        },
	        "name": name,
	    }


	# Remove the existing "Paragraph" class before it is re-created below.
	client.schema.delete_class("Paragraph")
	# The schema returned by this call is not used.
	client.schema.get()

	_paragraph_class = {
	    "class": "Paragraph",
	    "description": "A written paragraph",
	    "vectorizer": "text2vec-openai",
	    "moduleConfig": {
	        "text2vec-openai": {
	            "model": "ada",
	            "modelVersion": "002",
	            "type": "text",
	        }
	    },
	    "properties": [
	        # "content" has skip=False, "source" has skip=True.
	        _text_property("content", "The content of the paragraph", False),
	        _text_property("source", "The link", True),
	    ],
	}
	schema = {"classes": [_paragraph_class]}

	client.schema.create(schema)

	# Add every split document as a "Paragraph" object inside the client's
	# batch context manager.
	with client.batch as batch:
	    for doc in documents:
	        properties = {
	            "content": doc.page_content,
	            # The metadata value is converted with str() before upload.
	            "source": str(doc.metadata["source"]),
	        }
	        batch.add_data_object(properties, "Paragraph")