Spaces:

rajsecrets0
/

PDF_CHATBOT_2

Runtime error

App Files Files Community

PDF_CHATBOT_2 / app.py

rajsecrets0

Upload app.py

82df4eb about 2 years ago

raw

history blame contribute delete

2.48 kB

	# Dependencies must be installed from the shell (or listed in
	# requirements.txt); a bare `pip install` line is not valid Python and
	# stops the script with a SyntaxError before anything runs:
	#   pip install -qU cassio datasets langchain openai tiktoken

	# LangChain components to use
	from langchain.vectorstores.cassandra import Cassandra
	from langchain.indexes.vectorstore import VectorStoreIndexWrapper
	from langchain.llms import OpenAI
	from langchain.embeddings import OpenAIEmbeddings

	# Support for dataset retrieval with Hugging Face
	from datasets import load_dataset

	# With CassIO, the engine powering the Astra DB integration in LangChain,
	# you will also initialize the DB connection:
	import cassio

	# Shell command, not Python -- run it in a terminal or add PyPDF2 to
	# requirements.txt:
	#   pip install PyPDF2

	from PyPDF2 import PdfReader

	import os

	# Credentials are read from the environment so that no secret is ever
	# committed with the source. A missing variable raises KeyError naming
	# the variable, which fails fast before any network call is attempted.
	# NOTE(review): the token and key that were previously hard-coded here
	# have been published and should be revoked and reissued.
	ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
	ASTRA_DB_ID = os.environ["ASTRA_DB_ID"]  # Astra database ID
	OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]  # OpenAI API key

	# Path of the PDF file to index.
	pdfreader = PdfReader('Ethics.pdf')

	# NOTE(review): Concatenate is not used in the visible script; the import
	# is kept only in case code outside this view relies on it.
	from typing_extensions import Concatenate

	# Concatenate the text of every page into one string. Pages for which
	# extract_text() returns nothing (None or '') contribute an empty string,
	# exactly as the original `if content:` guard skipped them.
	raw_text = ''.join(page.extract_text() or '' for page in pdfreader.pages)


	# Initialise the Astra DB connection through CassIO.
	cassio.init(database_id=ASTRA_DB_ID, token=ASTRA_DB_APPLICATION_TOKEN)

	# The embedding model and the completion model share the same OpenAI key.
	embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
	llm = OpenAI(openai_api_key=OPENAI_API_KEY)

	# Vector store backed by the "qa_mini_demo" table. session and keyspace
	# are passed as None -- presumably they are then taken from the
	# cassio.init() call above; confirm against the LangChain/CassIO docs.
	astra_vector_store = Cassandra(
	    session=None,
	    keyspace=None,
	    table_name="qa_mini_demo",
	    embedding=embedding,
	)


	from langchain.text_splitter import CharacterTextSplitter

	# Split the raw text on newlines into overlapping chunks so that no single
	# chunk grows too large in tokens. Sizes are measured with len(), i.e. in
	# characters: chunk size 800, with a 200-character overlap between chunks.
	text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=200, length_function=len)
	texts = text_splitter.split_text(raw_text)



	# Store every chunk in the vector store. The list is passed directly:
	# the former `texts[:]` only made a throwaway copy.
	astra_vector_store.add_texts(texts)

	# What gets inserted are chunks of PDF text, not "headlines" (wording
	# left over from a different example), so the message now says so.
	print("Inserted %i text chunks." % len(texts))

	# Wrap the store so it can be queried with an LLM in the loop below.
	astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

	# Interactive question/answer loop: keep prompting until the user types
	# 'quit' (case-insensitive) or the input stream ends.
	first_question = True
	while True:
	    # The wording of the prompt differs only for the very first question.
	    if first_question:
	        prompt = "\nEnter your question (or type 'quit' to exit): "
	    else:
	        prompt = "\nWhat's your next question (or type 'quit' to exit): "

	    try:
	        query_text = input(prompt).strip()
	    except (EOFError, KeyboardInterrupt):
	        # No stdin attached, stdin closed, or Ctrl-C: leave the loop
	        # cleanly instead of terminating with a traceback.
	        break

	    if query_text.lower() == "quit":
	        break

	    # Blank input: ask again without querying the index.
	    if query_text == "":
	        continue

	    first_question = False

	    print("\nQUESTION: \"%s\"" % query_text)
	    answer = astra_vector_index.query(query_text, llm=llm).strip()
	    print("ANSWER: \"%s\"\n" % answer)