"""RAG question-answering over the IPCC AR6 WGII Technical Summary.

Pipeline: extract the PDF text -> strip recurring headers/footers ->
chunk by characters, then by sentence-transformer tokens -> index the
chunks in a persistent Chroma collection -> answer user questions with
GPT-3.5 through a Gradio text interface.
"""

import os
import re

import chromadb
import gradio as gr
import openai
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from openai import OpenAI
from pypdf import PdfReader

# Read the key from the environment instead of hard-coding it.
# NOTE(review): the original source embedded a literal `sk-...` secret —
# that key is leaked and must be revoked; never commit API keys.
openai.api_key = os.environ["OPENAI_API_KEY"]

IPCC_REPORT_FILE = "IPCC_AR6_WGII_TechnicalSummary (1).pdf"


def _load_and_clean_pdf(path):
    """Return cleaned per-page texts from the report PDF.

    Drops the first/last five pages (front and back matter) and removes
    the recurring "N\\nTechnical Summary" footer and "TS" page markers.
    """
    reader = PdfReader(path)
    # extract_text() can return None for image-only pages; treat as "".
    pages = [(page.extract_text() or "").strip() for page in reader.pages]
    body = pages[5:-5]
    body = [re.sub(r'\d+\nTechnical Summary', '', s) for s in body]
    body = [re.sub(r'\nTS', '', s) for s in body]
    body = [re.sub(r'TS\n', '', s) for s in body]
    return body


def _chunk_texts(cleaned_pages):
    """Split cleaned page texts into <=256-token chunks for embedding."""
    # chunk_overlap is a count of characters (resp. tokens), so it must be
    # an int — the original passed 0.2, which is not a valid overlap.
    char_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=200,
    )
    char_chunks = char_splitter.split_text('\n\n'.join(cleaned_pages))

    token_splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=50,
        tokens_per_chunk=256,
    )
    token_chunks = []
    for chunk in char_chunks:
        try:
            token_chunks.extend(token_splitter.split_text(chunk))
        except Exception as e:
            # Best-effort: log and skip chunks the tokenizer rejects.
            print(f"Error in text: {chunk}, {e}")
    return token_chunks


def _build_collection():
    """Create (or reopen) the persistent Chroma collection and index the PDF."""
    client = chromadb.PersistentClient(path="db")
    collection = client.get_or_create_collection("ipcc")
    # Only index on first run — re-adding the same IDs to a persistent
    # collection on every start would raise duplicate-ID errors.
    if collection.count() == 0:
        chunks = _chunk_texts(_load_and_clean_pdf(IPCC_REPORT_FILE))
        collection.add(
            ids=[str(i) for i in range(len(chunks))],
            documents=chunks,
        )
    return collection


chroma_collection = _build_collection()


def rag(query, n_results=5):
    """Answer `query` using the top `n_results` retrieved chunks.

    Retrieves the most similar document chunks from the Chroma collection,
    concatenates them, and asks gpt-3.5-turbo to answer using only that
    context. Returns the model's answer as a string.
    """
    res = chroma_collection.query(query_texts=[query], n_results=n_results)
    docs = res["documents"][0]
    joined_information = ';'.join([f'{doc}' for doc in docs])
    messages = [
        {
            "role": "system",
            "content": "You are a helpful expert on climate change. Your users are asking questions about information contained in attached information."
            "You will be shown the user's question, and the relevant information. Answer the user's question using only this information."
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {joined_information}"},
    ]
    openai_client = OpenAI()
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    return response.choices[0].message.content


iface = gr.Interface(
    fn=rag,
    inputs=["text"],
    outputs="text",
    title="Climate Change RAG (Using OpenAI)",
    description="Ask questions about the impact of climate change and get answers based on the provided document.",
)

if __name__ == "__main__":
    iface.launch(debug=True)