Spaces:
Sleeping
Sleeping
import os
import re

import chromadb
import gradio as gr
import openai
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from openai import OpenAI
from pypdf import PdfReader

# SECURITY: never hard-code API keys in source. A previous revision of this
# file embedded a live OpenAI key; any key that has ever been committed must
# be considered compromised and rotated. The key is now read from the
# environment (raises KeyError early if it is not configured).
openai.api_key = os.environ["OPENAI_API_KEY"]

# Load the PDF and extract one whitespace-trimmed text string per page.
ipcc_report_file = "IPCC_AR6_WGII_TechnicalSummary (1).pdf"
reader = PdfReader(ipcc_report_file)
ipcc_texts = [page.extract_text().strip() for page in reader.pages]

# Drop front matter and back matter (first and last five pages).
ipcc_texts_filt = ipcc_texts[5:-5]

# Remove recurring header/footer artifacts ("<page>\nTechnical Summary", "TS").
ipcc_wo_header_footer = [re.sub(r'\d+\nTechnical Summary', '', s) for s in ipcc_texts_filt]
ipcc_wo_header_footer = [re.sub(r'\nTS', '', s) for s in ipcc_wo_header_footer]
ipcc_wo_header_footer = [re.sub(r'TS\n', '', s) for s in ipcc_wo_header_footer]

# Character-level split. NOTE: chunk_overlap is a character COUNT, not a
# fraction -- the previous value of 0.2 meant effectively zero overlap.
# 200 characters = 20% of chunk_size.
char_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)
texts_char_splitted = char_splitter.split_text('\n\n'.join(ipcc_wo_header_footer))

# Token-level split for the embedding model. chunk_overlap is a token count
# here as well (~20% of tokens_per_chunk).
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=50,
    tokens_per_chunk=256,
)
texts_token_splitted = []
for text in texts_char_splitted:
    try:
        texts_token_splitted.extend(token_splitter.split_text(text))
    except Exception as e:  # best-effort: skip chunks the tokenizer rejects
        print(f"Error in text: {text}, {e}")
        continue

# Create (or reopen) the persistent vector store and index every chunk,
# using the chunk's position as its stable string id.
chroma_client = chromadb.PersistentClient(path="db")
chroma_collection = chroma_client.get_or_create_collection("ipcc")
ids = [str(i) for i in range(len(texts_token_splitted))]
chroma_collection.add(
    ids=ids,
    documents=texts_token_splitted,
)
# Retrieval-augmented generation over the indexed IPCC report.
def rag(query, n_results=5):
    """Answer *query* using only chunks retrieved from the vector store.

    Parameters
    ----------
    query : str
        Natural-language question from the user.
    n_results : int, optional
        Number of document chunks to retrieve as context (default 5).

    Returns
    -------
    str
        The model's answer, grounded in the retrieved chunks.
    """
    res = chroma_collection.query(query_texts=[query], n_results=n_results)
    docs = res["documents"][0]
    # Chunks are concatenated into a single context string for the prompt.
    joined_information = ';'.join(docs)
    # BUGFIX: the original implicitly-concatenated system prompt was missing
    # a space ("...attached information.You will be shown...").
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert on climate change. Your users are "
                "asking questions about information contained in attached "
                "information. You will be shown the user's question, and the "
                "relevant information. Answer the user's question using only "
                "this information."
            ),
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {joined_information}"},
    ]
    openai_client = OpenAI()
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    return response.choices[0].message.content
# Wire the rag() entry point into a simple question/answer web UI.
iface = gr.Interface(
    fn=rag,
    inputs=["text"],
    outputs="text",
    title="Climate Change RAG (Using OpenAI)",
    description=(
        "Ask questions about the impact of climate change and get answers "
        "based on the provided document."
    ),
)

# Start the Gradio server; debug mode surfaces errors in the console.
iface.launch(debug=True)