Spaces:
Sleeping
Sleeping
| import os | |
| from dotenv import load_dotenv | |
| from langchain.document_loaders import PyPDFLoader, DirectoryLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.agents import Tool, AgentExecutor | |
| from langchain.tools.retriever import create_retriever_tool | |
| from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import AzureOpenAIEmbeddings | |
| from langchain_community.chat_models import AzureChatOpenAI | |
| from openai import AzureOpenAI | |
| import warnings | |
# Load environment variables (python-dotenv reads a local .env file, if present).
load_dotenv()

# Azure OpenAI connection settings. All four are required by the model and
# embedding clients constructed further down in this file.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

# Fail fast when any setting is unset or empty. This single check already
# covers AZURE_OPENAI_API_KEY, so no separate re-read/re-check of the key is
# needed afterwards.
if not all(
    [
        AZURE_OPENAI_API_KEY,
        AZURE_OPENAI_ENDPOINT,
        AZURE_OPENAI_LLM_DEPLOYMENT,
        AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    ]
):
    raise ValueError("Missing one or more Azure OpenAI environment variables.")

# NOTE: this silences every warning category for the rest of the process,
# not only warnings coming from the libraries imported above.
warnings.filterwarnings("ignore")

# Shared size constant: read by text_split() as the splitter's chunk_size and
# also passed as chunk_size to the embeddings client.
chunk_size = 500
# Extract data from the PDFs
def load_pdf_file(data_path):
    """Load the PDF files found in a directory.

    Args:
        data_path: Directory passed to ``DirectoryLoader``; files matching
            ``*.pdf`` are read with ``PyPDFLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()
# Split the data into chunks
def text_split(docs):
    """Split documents into smaller pieces.

    A ``RecursiveCharacterTextSplitter`` is built with the module-level
    ``chunk_size`` and a fixed overlap of 20.

    Args:
        docs: Documents to hand to ``split_documents``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_overlap=20,
        chunk_size=chunk_size,
    )
    pieces = chunker.split_documents(docs)
    return pieces
# Set up LLM and Embedding

# Chat model, addressed through the deployment name and endpoint read from
# the environment.
llm = AzureChatOpenAI(
    openai_api_version="2023-12-01-preview",  # or your supported version
    openai_api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    deployment_name=AZURE_OPENAI_LLM_DEPLOYMENT,
    # A temperature (e.g. 0.5) may be passed here, but only if the
    # deployment supports it.
)

# Embedding client used to vectorise the document chunks.
embeddings = AzureOpenAIEmbeddings(
    openai_api_version="2023-12-01-preview",
    openai_api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    # NOTE(review): this reuses the text-splitter constant; presumably the
    # parameter here is the embedding request batch size rather than a
    # character count -- confirm against the AzureOpenAIEmbeddings docs.
    chunk_size=chunk_size,  # or another value up to 2048
)
# Load PDF, chunk it, embed it, and store in FAISS
pdf_docs = load_pdf_file("Dataset/")  # Update this to your PDF folder
chunks = text_split(pdf_docs)

# Building a FAISS index from an empty list fails deep inside the vector
# store with an unhelpful error, so stop here with a clear message instead.
if not chunks:
    raise ValueError("No text chunks were produced from the PDFs in 'Dataset/'.")

# Embed every chunk and persist the resulting index to a local folder.
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local("faiss_index_sysml")
| # Load FAISS and create retriever QA chain | |
| # new_vectorstore = FAISS.load_local("faiss_index_sysml", embeddings, allow_dangerous_deserialization=True) | |
| # qa = RetrievalQA.from_chain_type( | |
| # llm=llm, | |
| # chain_type="stuff", | |
| # retriever=new_vectorstore.as_retriever() | |
| # ) | |
| # # Run a sample query | |
| # query = "What is SysML used for?" | |
| # print("User:", query) | |
| # print("Bot:", qa.run(query)) | |