kamkol committed on
Commit
abe7dd0
·
1 Parent(s): d0aa61e

Fix issue with process_data

Browse files
Files changed (6) hide show
  1. .DS_Store +0 -0
  2. Dockerfile +7 -0
  3. app/app.py +9 -19
  4. check_dependencies.py +95 -0
  5. process_data.py +7 -2
  6. requirements.txt +1 -2
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
Dockerfile CHANGED
@@ -11,6 +11,9 @@ RUN apt-get update && apt-get install -y \
11
  COPY requirements.txt .
12
  RUN pip install --no-cache-dir -r requirements.txt
13
 
 
 
 
14
  # Create necessary directories
15
  RUN mkdir -p data/processed_data
16
 
@@ -18,10 +21,14 @@ RUN mkdir -p data/processed_data
18
  COPY app/ ./app/
19
  COPY data/processed_data/chunks.pkl ./data/processed_data/
20
  COPY data/processed_data/embedded_docs.pkl ./data/processed_data/
 
21
 
22
  # Enable more verbose logging
23
  ENV PYTHONUNBUFFERED=1
24
 
 
 
 
25
  # Set the entry point to run the Streamlit app
26
  # Use debug.py to troubleshoot if the main app fails
27
  EXPOSE 8501
 
11
  COPY requirements.txt .
12
  RUN pip install --no-cache-dir -r requirements.txt
13
 
14
+ # Explicitly install sentence-transformers (sometimes needed for HF Spaces)
15
+ RUN pip install --no-cache-dir sentence-transformers==2.3.0
16
+
17
  # Create necessary directories
18
  RUN mkdir -p data/processed_data
19
 
 
21
  COPY app/ ./app/
22
  COPY data/processed_data/chunks.pkl ./data/processed_data/
23
  COPY data/processed_data/embedded_docs.pkl ./data/processed_data/
24
+ COPY check_dependencies.py ./
25
 
26
  # Enable more verbose logging
27
  ENV PYTHONUNBUFFERED=1
28
 
29
+ # Run a quick check to verify dependencies are installed correctly
30
+ RUN python check_dependencies.py || echo "Dependency check completed with warnings"
31
+
32
  # Set the entry point to run the Streamlit app
33
  # Use debug.py to troubleshoot if the main app fails
34
  EXPOSE 8501
app/app.py CHANGED
@@ -30,17 +30,6 @@ if not os.environ.get("OPENAI_API_KEY"):
30
  st.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
31
  st.stop()
32
 
33
- # Custom embedding model class
34
- class CustomEmbeddingModel:
35
- def __init__(self, model_name):
36
- self.model = SentenceTransformer(model_name)
37
-
38
- def embed_query(self, text):
39
- return self.model.encode(text)
40
-
41
- def embed_documents(self, texts):
42
- return self.model.encode(texts)
43
-
44
  # Custom vector store implementation
45
  class CustomVectorStore(VectorStore):
46
  def __init__(self, embedded_docs, embedding_model):
@@ -49,7 +38,7 @@ class CustomVectorStore(VectorStore):
49
 
50
  def similarity_search_with_score(self, query, k=5):
51
  # Get the query embedding
52
- query_embedding = self.embedding_model.embed_query(query)
53
 
54
  # Calculate similarity scores
55
  results = []
@@ -158,17 +147,18 @@ def initialize_vectorstore():
158
  embedded_docs = []
159
  raise RuntimeError(f"Error loading embedded_docs.pkl: {str(e)}")
160
 
 
 
 
 
 
 
 
 
161
  if not chunks or not embedded_docs:
162
  # Return empty vectorstore as fallback
163
- embedding_model = CustomEmbeddingModel("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
164
  vectorstore = CustomVectorStore([], embedding_model)
165
  return vectorstore, []
166
-
167
- # Initialize embedding model
168
- try:
169
- embedding_model = CustomEmbeddingModel("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
170
- except Exception as e:
171
- raise RuntimeError(f"Error initializing embedding model: {str(e)}")
172
 
173
  # Create custom vectorstore
174
  vectorstore = CustomVectorStore(embedded_docs, embedding_model)
 
30
  st.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
31
  st.stop()
32
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Custom vector store implementation
34
  class CustomVectorStore(VectorStore):
35
  def __init__(self, embedded_docs, embedding_model):
 
38
 
39
  def similarity_search_with_score(self, query, k=5):
40
  # Get the query embedding
41
+ query_embedding = self.embedding_model.encode(query)
42
 
43
  # Calculate similarity scores
44
  results = []
 
147
  embedded_docs = []
148
  raise RuntimeError(f"Error loading embedded_docs.pkl: {str(e)}")
149
 
150
+ # Initialize embedding model - use SentenceTransformer directly
151
+ model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
152
+ try:
153
+ embedding_model = SentenceTransformer(model_name)
154
+ except Exception as e:
155
+ print(f"Error loading model: {str(e)}")
156
+ raise RuntimeError(f"Error initializing SentenceTransformer model: {str(e)}")
157
+
158
  if not chunks or not embedded_docs:
159
  # Return empty vectorstore as fallback
 
160
  vectorstore = CustomVectorStore([], embedding_model)
161
  return vectorstore, []
 
 
 
 
 
 
162
 
163
  # Create custom vectorstore
164
  vectorstore = CustomVectorStore(embedded_docs, embedding_model)
check_dependencies.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to check installed dependencies for troubleshooting
4
+ """
5
+ import sys
6
+ import pkg_resources
7
+ import importlib
8
+ import platform
9
+
10
def check_imports():
    """Try importing every critical module and report each one's version.

    Returns:
        bool: True only when all modules import successfully.
    """
    modules_to_probe = (
        "sentence_transformers",
        "streamlit",
        "langchain_core",
        "langchain_openai",
        "langgraph",
        "numpy",
        "scipy",
    )

    all_ok = True
    print("\n=== Import Checks ===")
    for module_name in modules_to_probe:
        try:
            mod = importlib.import_module(module_name)
        except ImportError as e:
            # Keep going so every failure is listed, not just the first.
            all_ok = False
            print(f"✗ {module_name}: FAILED - {str(e)}")
        else:
            # Not every package exposes __version__; fall back gracefully.
            version = getattr(mod, "__version__", "Unknown")
            print(f"✓ {module_name}: {version}")
    return all_ok
33
+
34
def check_sentence_transformer():
    """Check if SentenceTransformer works properly.

    Loads a small public model and encodes one test sentence to exercise
    the full embedding pipeline.

    Returns:
        bool: True when the model loads and produces an embedding,
        False on any failure (import error, download error, etc.).
    """
    print("\n=== SentenceTransformer Check ===")
    try:
        import sentence_transformers
        from sentence_transformers import SentenceTransformer

        # Bug fix: the version attribute lives on the *package*, not on the
        # SentenceTransformer class. The original `SentenceTransformer.__version__`
        # raised AttributeError, so this check always reported failure even
        # when the library was installed and working.
        print(f"SentenceTransformer version: {sentence_transformers.__version__}")

        # Try loading a lightweight default model to see if it works
        print("Testing with a default model...")
        model = SentenceTransformer('all-MiniLM-L6-v2')
        test_embedding = model.encode("Test sentence to check if embeddings work")
        print(f"✓ Successfully created embedding with shape: {test_embedding.shape}")
        return True
    except Exception as e:
        # Broad catch is deliberate: this is a best-effort diagnostic and
        # must never crash the container build that invokes it.
        print(f"✗ SentenceTransformer test failed: {str(e)}")
        return False
50
+
51
def print_system_info():
    """Print interpreter version and platform details for troubleshooting."""
    report_lines = (
        "\n=== System Information ===",
        f"Python version: {sys.version}",
        f"Platform: {platform.platform()}",
        f"Implementation: {platform.python_implementation()}",
    )
    for line in report_lines:
        print(line)
+
58
def main():
    """Run all dependency checks and print a summary verdict.

    Prints system info, the installed version of each key package, then
    runs the import and SentenceTransformer checks. Always exits normally
    so a Docker RUN step invoking it cannot fail the build.
    """
    print("=== Dependency Check ===")
    print_system_info()

    # pkg_resources is deprecated (removed from modern setuptools and
    # emits DeprecationWarning); importlib.metadata is the stdlib
    # replacement and normalizes distribution names the same way.
    # Local import keeps this module loadable even without setuptools.
    from importlib.metadata import PackageNotFoundError, version as dist_version

    # Check for specific dependencies
    key_packages = [
        "sentence-transformers",
        "streamlit",
        "langchain-core",
        "langchain-openai",
        "langgraph",
        "numpy",
        "scipy",
    ]

    print("\n=== Package Versions ===")
    for pkg in key_packages:
        try:
            version = dist_version(pkg)
        except PackageNotFoundError:
            version = "Not installed"
        print(f"{pkg}: {version}")

    # Test imports
    imports_ok = check_imports()

    # Test SentenceTransformer
    st_ok = check_sentence_transformer()

    # Final result
    if imports_ok and st_ok:
        print("\n✓ All critical checks passed")
    else:
        print("\n✗ Some checks failed, see details above")


if __name__ == "__main__":
    main()
process_data.py CHANGED
@@ -150,8 +150,13 @@ def process_pdfs():
150
  with open(processed_data_dir / "chunks.pkl", "wb") as f:
151
  pickle.dump(split_chunks, f)
152
 
153
- # Initialize embedding model
154
- embedding_model = SentenceTransformer("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
 
 
 
 
 
155
 
156
  print("Embedding document chunks (this may take a while)...")
157
  # Create a dictionary to store documents and their embeddings
 
150
  with open(processed_data_dir / "chunks.pkl", "wb") as f:
151
  pickle.dump(split_chunks, f)
152
 
153
+ # Initialize embedding model using SentenceTransformer directly
154
+ try:
155
+ embedding_model = SentenceTransformer("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
156
+ print("Successfully loaded SentenceTransformer model")
157
+ except Exception as e:
158
+ print(f"Error loading model: {str(e)}")
159
+ raise RuntimeError(f"Error initializing SentenceTransformer model: {str(e)}")
160
 
161
  print("Embedding document chunks (this may take a while)...")
162
  # Create a dictionary to store documents and their embeddings
requirements.txt CHANGED
@@ -10,5 +10,4 @@ tiktoken>=0.6.0
10
  python-dotenv>=1.0.1
11
  qdrant-client>=1.7.0
12
  scipy>=1.10.0
13
- langchain-huggingface>=0.0.2
14
- sentence-transformers>=2.3.0
 
10
  python-dotenv>=1.0.1
11
  qdrant-client>=1.7.0
12
  scipy>=1.10.0
13
+ sentence-transformers==2.3.0