Spaces:
Sleeping
Sleeping
import os
import re

import chromadb
import gradio as gr
import openai
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from openai import OpenAI
from pypdf import PdfReader

# SECURITY: never hard-code API keys in source. A previous revision of this
# file embedded a live OpenAI key; any key that has ever been committed must
# be considered compromised and rotated. The key is now read from the
# environment (raises KeyError early if it is not configured).
openai.api_key = os.environ["OPENAI_API_KEY"]

# Load the PDF and extract one whitespace-trimmed text string per page.
ipcc_report_file = "IPCC_AR6_WGII_TechnicalSummary (1).pdf"
reader = PdfReader(ipcc_report_file)
ipcc_texts = [page.extract_text().strip() for page in reader.pages]

# Drop front matter and back matter (first and last five pages).
ipcc_texts_filt = ipcc_texts[5:-5]

# Remove recurring header/footer artifacts ("<page>\nTechnical Summary", "TS").
ipcc_wo_header_footer = [re.sub(r'\d+\nTechnical Summary', '', s) for s in ipcc_texts_filt]
ipcc_wo_header_footer = [re.sub(r'\nTS', '', s) for s in ipcc_wo_header_footer]
ipcc_wo_header_footer = [re.sub(r'TS\n', '', s) for s in ipcc_wo_header_footer]

# Character-level split. NOTE: chunk_overlap is a character COUNT, not a
# fraction -- the previous value of 0.2 meant effectively zero overlap.
# 200 characters = 20% of chunk_size.
char_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)
texts_char_splitted = char_splitter.split_text('\n\n'.join(ipcc_wo_header_footer))

# Token-level split for the embedding model. chunk_overlap is a token count
# here as well (~20% of tokens_per_chunk).
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=50,
    tokens_per_chunk=256,
)
texts_token_splitted = []
for text in texts_char_splitted:
    try:
        texts_token_splitted.extend(token_splitter.split_text(text))
    except Exception as e:  # best-effort: skip chunks the tokenizer rejects
        print(f"Error in text: {text}, {e}")
        continue

# Create (or reopen) the persistent vector store and index every chunk,
# using the chunk's position as its stable string id.
chroma_client = chromadb.PersistentClient(path="db")
chroma_collection = chroma_client.get_or_create_collection("ipcc")
ids = [str(i) for i in range(len(texts_token_splitted))]
chroma_collection.add(
    ids=ids,
    documents=texts_token_splitted,
)
# Retrieval-augmented generation over the indexed IPCC report.
def rag(query, n_results=5):
    """Answer *query* using only chunks retrieved from the vector store.

    Parameters
    ----------
    query : str
        Natural-language question from the user.
    n_results : int, optional
        Number of document chunks to retrieve as context (default 5).

    Returns
    -------
    str
        The model's answer, grounded in the retrieved chunks.
    """
    res = chroma_collection.query(query_texts=[query], n_results=n_results)
    docs = res["documents"][0]
    # Chunks are concatenated into a single context string for the prompt.
    joined_information = ';'.join(docs)
    # BUGFIX: the original implicitly-concatenated system prompt was missing
    # a space ("...attached information.You will be shown...").
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful expert on climate change. Your users are "
                "asking questions about information contained in attached "
                "information. You will be shown the user's question, and the "
                "relevant information. Answer the user's question using only "
                "this information."
            ),
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {joined_information}"},
    ]
    openai_client = OpenAI()
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    return response.choices[0].message.content
# Wire the rag() entry point into a simple question/answer web UI.
iface = gr.Interface(
    fn=rag,
    inputs=["text"],
    outputs="text",
    title="Climate Change RAG (Using OpenAI)",
    description=(
        "Ask questions about the impact of climate change and get answers "
        "based on the provided document."
    ),
)

# Start the Gradio server; debug mode surfaces errors in the console.
iface.launch(debug=True)