nivakaran committed on
Commit
d68a3d3
·
verified ·
1 Parent(s): 388931a

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +20 -23
src/streamlit_app.py CHANGED
@@ -15,48 +15,46 @@ from langchain_huggingface import HuggingFaceEmbeddings
15
  from langchain_text_splitters import RecursiveCharacterTextSplitter
16
  from langchain_community.document_loaders import PyPDFLoader
17
  from langchain_chroma import Chroma
 
18
 
19
  # Configure logging
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
22
 
23
- # Set up proper cache directories for HuggingFace Spaces
24
  def setup_environment():
25
- # Create cache directories in a writable location
26
- cache_dir = Path("/tmp/cache") # Using /tmp which is writable in HuggingFace Spaces
27
  cache_dir.mkdir(exist_ok=True)
28
-
29
- # Set environment variables
30
- os.environ['STREAMLIT_HOME'] = str(cache_dir / "streamlit")
31
  os.environ['HF_HOME'] = str(cache_dir / "huggingface")
32
- os.environ['TRANSFORMERS_CACHE'] = str(cache_dir / "transformers")
33
- os.environ['XDG_CACHE_HOME'] = str(cache_dir)
34
-
35
- # Ensure subdirectories exist
36
- (cache_dir / "huggingface").mkdir(exist_ok=True)
37
- (cache_dir / "streamlit").mkdir(exist_ok=True)
38
- (cache_dir / "transformers").mkdir(exist_ok=True)
39
 
40
  setup_environment()
41
 
42
  # Load environment variables
43
  load_dotenv()
44
- HF_TOKEN = os.getenv("HF_TOKEN")
45
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
46
- PDF_PATH = os.getenv("PDF_PATH", "./nivakaran.pdf")
47
 
48
  # Validate environment variables
49
- if not all([HF_TOKEN, GROQ_API_KEY, PDF_PATH]):
50
  st.error("Missing required environment variables")
51
  st.stop()
52
 
53
- # Initialize RAG components with proper cache handling
 
 
 
 
 
54
  try:
 
 
 
 
55
  embeddings = HuggingFaceEmbeddings(
56
  model_name="sentence-transformers/all-MiniLM-L6-v2",
57
  model_kwargs={'device': 'cpu'},
58
- encode_kwargs={'normalize_embeddings': True},
59
- cache_folder=os.environ['HF_HOME']
60
  )
61
  except Exception as e:
62
  logger.error(f"Failed to initialize embeddings: {str(e)}")
@@ -64,7 +62,6 @@ except Exception as e:
64
  st.stop()
65
 
66
  llm = ChatGroq(model_name="Deepseek-R1-Distill-Llama-70b", temperature=0.1)
67
- session_store = {}
68
 
69
  # Process PDF into vectorstore
70
  def process_pdf(file_path: str):
@@ -74,12 +71,10 @@ def process_pdf(file_path: str):
74
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
75
  splits = text_splitter.split_documents(documents)
76
 
77
- # Use temporary directory for Chroma DB
78
- chroma_dir = "/tmp/chroma_db"
79
  vectorstore = Chroma.from_documents(
80
  documents=splits,
81
  embedding=embeddings,
82
- persist_directory=chroma_dir
83
  )
84
  logger.info(f"PDF {file_path} processed successfully")
85
  return vectorstore
@@ -97,6 +92,8 @@ except Exception as e:
97
  st.error("Failed to initialize document store. Please try again later.")
98
  st.stop()
99
 
 
 
100
  # System prompt for the assistant
101
  system_prompt = """You are Max, a friendly and professional chatbot designed to
102
  assist visitors to Nivakaran's portfolio website. Your primary goal
 
15
  from langchain_text_splitters import RecursiveCharacterTextSplitter
16
  from langchain_community.document_loaders import PyPDFLoader
17
  from langchain_chroma import Chroma
18
+ import torch
19
 
20
  # Configure logging
21
  logging.basicConfig(level=logging.INFO)
22
  logger = logging.getLogger(__name__)
23
 
24
+ # Set up proper cache directories
25
  def setup_environment():
26
+ cache_dir = Path("/tmp/cache")
 
27
  cache_dir.mkdir(exist_ok=True)
 
 
 
28
  os.environ['HF_HOME'] = str(cache_dir / "huggingface")
29
+ os.environ['STREAMLIT_HOME'] = str(cache_dir / "streamlit")
 
 
 
 
 
 
30
 
31
  setup_environment()
32
 
33
  # Load environment variables
34
  load_dotenv()
 
35
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
36
+ PDF_PATH = os.getenv("PDF_PATH", "nivakaran.pdf") # Changed to direct filename
37
 
38
  # Validate environment variables
39
+ if not all([GROQ_API_KEY]):
40
  st.error("Missing required environment variables")
41
  st.stop()
42
 
43
+ # Verify PDF exists
44
+ if not Path(PDF_PATH).exists():
45
+ st.error(f"PDF file not found at: {PDF_PATH}")
46
+ st.stop()
47
+
48
+ # Initialize RAG components with proper device handling
49
  try:
50
+ # Let PyTorch fall back to CPU for ops unsupported on Apple MPS and lift the MPS memory cap (the CPU device itself is selected via model_kwargs below)
51
+ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
52
+ os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
53
+
54
  embeddings = HuggingFaceEmbeddings(
55
  model_name="sentence-transformers/all-MiniLM-L6-v2",
56
  model_kwargs={'device': 'cpu'},
57
+ encode_kwargs={'normalize_embeddings': True}
 
58
  )
59
  except Exception as e:
60
  logger.error(f"Failed to initialize embeddings: {str(e)}")
 
62
  st.stop()
63
 
64
  llm = ChatGroq(model_name="Deepseek-R1-Distill-Llama-70b", temperature=0.1)
 
65
 
66
  # Process PDF into vectorstore
67
  def process_pdf(file_path: str):
 
71
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
72
  splits = text_splitter.split_documents(documents)
73
 
 
 
74
  vectorstore = Chroma.from_documents(
75
  documents=splits,
76
  embedding=embeddings,
77
+ persist_directory="/tmp/chroma_db"
78
  )
79
  logger.info(f"PDF {file_path} processed successfully")
80
  return vectorstore
 
92
  st.error("Failed to initialize document store. Please try again later.")
93
  st.stop()
94
 
95
+ # [Rest of your existing Streamlit UI code remains the same...]
96
+
97
  # System prompt for the assistant
98
  system_prompt = """You are Max, a friendly and professional chatbot designed to
99
  assist visitors to Nivakaran's portfolio website. Your primary goal