datboyalex committed on
Commit
bc94c3a
·
verified ·
1 Parent(s): 2d87315

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled8.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1krY-kSVbf8NSdFeA5eZ_1vvYGLuuSv7I
8
+ """
9
+
10
+ import os
11
+ import pandas as pd
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
+ from langchain_openai import ChatOpenAI
14
+ from langchain_openai import OpenAIEmbeddings
15
+ from langchain.vectorstores import FAISS
16
+ from langchain.chains import RetrievalQA
17
+ import gradio as gr
18
+
19
# Step 1: Load the System Prompt
# Fail fast with an actionable message when the prompt file has not been
# uploaded alongside the app.
prompt_path = "system_prompt.txt"  # Ensure this file is in the same directory
if not os.path.exists(prompt_path):
    raise FileNotFoundError(f"The file '{prompt_path}' is missing. Please upload it to the Space.")

# Decode explicitly as UTF-8 so the prompt text does not depend on the
# host's locale default encoding.
with open(prompt_path, "r", encoding="utf-8") as file:
    system_prompt = file.read()
26
+
27
# Step 2: Load the Retrieval Database
csv_path = "retrievaldb.csv"  # Ensure this file is in the same directory
if os.path.exists(csv_path):
    # Read the whole retrieval table into a DataFrame.
    df = pd.read_csv(csv_path)
else:
    raise FileNotFoundError(f"The file '{csv_path}' is missing. Please upload it to the Space.")
34
+
35
# Step 3: Preprocess the Data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = []
metadatas = []


def _cell_or_default(row, column, default):
    """Return ``row[column]``, or *default* when the column is absent or the cell is NaN.

    ``row.get(column, default)`` alone only covers a missing column; an empty
    CSV cell is read as NaN and would otherwise leak into the metadata.
    """
    value = row.get(column, default)
    return default if pd.isna(value) else value


# Process each row to chunk text and attach metadata
for _, row in df.iterrows():
    chunk_text = row.get("chunk_text", "")
    if pd.isna(chunk_text):
        continue  # nothing to index for this row

    row_metadata = {
        "source": _cell_or_default(row, "content_source", "Unknown Source"),
        "title": _cell_or_default(row, "document_name", "Unknown Document"),
        "page": _cell_or_default(row, "page_number", "N/A"),
        "topic": _cell_or_default(row, "main_topic", "N/A"),
        # NOTE(review): "week" is read from the "metadata" column — confirm
        # this mapping is intended.
        "week": _cell_or_default(row, "metadata", "N/A"),
    }

    # str() guards against cells that pandas parsed as a non-string type.
    for chunk in text_splitter.split_text(str(chunk_text)):
        texts.append(chunk)
        # Each chunk gets its own dict so later mutation of one entry
        # cannot affect its siblings.
        metadatas.append(dict(row_metadata))

if len(texts) != len(metadatas):
    raise ValueError("Mismatch between texts and metadata after preprocessing.")

# Stop here with a clear message rather than handing an empty corpus to the
# vector-store construction that follows.
if not texts:
    raise ValueError("No text chunks were produced from the 'chunk_text' column of the retrieval database.")
57
+
58
# Step 4: Create the Vector Store
# Validate the API key BEFORE building anything: the embeddings client below
# is constructed without an explicit key and so relies on the environment.
# Checking first guarantees the actionable message is the error users see.
openai_api_key = os.getenv("OPENAI_API_KEY")  # Securely access the API key from Hugging Face Secrets
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set. Please add it to the Space Secrets.")

embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
)

# Step 5: Initialize the LLM
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.7,
    api_key=openai_api_key,
)
76
+
77
# Step 6: Set Up the RetrievalQA Chain
# The retriever asks the vector store for k=5 results per query.
retriever = vector_store.as_retriever(search_kwargs=dict(k=5))
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # Concatenates retrieved chunks for context
    return_source_documents=False,  # Do not include source documents in the response
)
85
+
86
+ # Step 7: Define Query Function
87
def query_bradtgpt(user_input):
    """Answer *user_input* using the retriever and the QA chain's document combiner.

    Args:
        user_input: The question text typed by the user. ``None`` and
            whitespace-only strings are treated as "no question".

    Returns:
        The model's answer as a string, or a short prompt asking for a
        question when the input is blank.
    """
    question = (user_input or "").strip()
    if not question:
        # Nothing to search for or answer; skip the retrieval and LLM calls.
        return "Please enter a question."

    # Add system prompt dynamically to the text the LLM sees.
    full_prompt = f"\n{system_prompt}\n\nUser: {question}\nAssistant:\n"

    # Search with the bare question only. Passing ``full_prompt`` as the
    # chain's "query" would also use it for the similarity search, letting
    # the (constant) system prompt dominate what gets retrieved.
    documents = retriever.invoke(question)
    response = qa_chain.combine_documents_chain.invoke(
        {"input_documents": documents, "question": full_prompt}
    )
    return response["output_text"]  # Return the main answer only
97
+
98
+ # Step 8: Gradio Interface
99
def respond(message):
    """Forward *message* to ``query_bradtgpt`` and return its answer unchanged."""
    answer = query_bradtgpt(message)
    return answer
101
+
102
# Build the two text components first so the Interface call stays short.
_question_box = gr.Textbox(
    label="Your question",
    placeholder="Ask BradGPT anything about CPSC 183!",
    lines=3,
)
_answer_box = gr.Textbox(label="Response", lines=10)

# Single-input, single-output UI wired to ``respond``.
demo = gr.Interface(
    fn=respond,
    inputs=_question_box,
    outputs=_answer_box,
    title="BradGPT",
    description="Ask BradGPT questions about CPSC 183 course readings or topics.",
    theme="monochrome",
)

# Only start the server when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()