menikev committed on
Commit
4768c21
·
verified ·
1 Parent(s): befecdb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -20
app.py CHANGED
@@ -4,7 +4,7 @@ from langchain_community.vectorstores import FAISS
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.chains import RetrievalQA
6
  from langchain_community.llms import HuggingFaceHub
7
- from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
8
 
9
  # You can use this section to suppress warnings generated by your code:
10
  def warn(*args, **kwargs):
@@ -39,42 +39,98 @@ def document_loader(file_path):
39
  """
40
  Loads a PDF document from the given file path.
41
  """
42
- loader = PyPDFLoader(file_path)
43
- loaded_document = loader.load()
44
- return loaded_document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  ## Text splitter
47
  def text_splitter(data):
48
  """
49
  Splits the loaded document into smaller chunks for processing.
50
  """
51
- text_splitter = RecursiveCharacterTextSplitter(
52
- chunk_size=1000,
53
- chunk_overlap=200,
54
- length_function=len,
55
- )
56
- chunks = text_splitter.split_documents(data)
57
- return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  ## Vector db and Embedding model
60
  def vector_database(chunks):
61
  """
62
  Creates a FAISS vector database from the document chunks using a
63
- Hugging Face embeddings model.
64
  """
65
- # Fixed: Using proper parameter name for HuggingFaceInferenceAPIEmbeddings
66
- embedding_model = HuggingFaceInferenceAPIEmbeddings(
67
- api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
68
- model_name="sentence-transformers/all-MiniLM-L6-v2"
69
- )
70
-
71
- # Add error handling for embedding creation
72
  try:
 
 
 
 
 
 
 
 
 
 
73
  vectordb = FAISS.from_documents(chunks, embedding_model)
 
74
  return vectordb
 
75
  except Exception as e:
76
  print(f"Error creating vector database: {e}")
77
- raise ValueError(f"Failed to create embeddings: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  ## Retriever
80
  def retriever(file_path):
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.chains import RetrievalQA
6
  from langchain_community.llms import HuggingFaceHub
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings
8
 
9
  # You can use this section to suppress warnings generated by your code:
10
  def warn(*args, **kwargs):
 
39
  """
40
  Loads a PDF document from the given file path.
41
  """
42
+ try:
43
+ loader = PyPDFLoader(file_path)
44
+ loaded_document = loader.load()
45
+
46
+ # Check if document was loaded successfully
47
+ if not loaded_document:
48
+ raise ValueError("No content could be extracted from the PDF")
49
+
50
+ print(f"Successfully loaded {len(loaded_document)} pages from PDF")
51
+
52
+ # Check if pages have content
53
+ total_content = sum(len(doc.page_content.strip()) for doc in loaded_document)
54
+ if total_content == 0:
55
+ raise ValueError("PDF appears to be empty or contains no extractable text")
56
+
57
+ print(f"Total content length: {total_content} characters")
58
+ return loaded_document
59
+
60
+ except Exception as e:
61
+ print(f"Error loading document: {e}")
62
+ raise ValueError(f"Failed to load PDF: {e}")
63
 
64
## Text splitter
def text_splitter(data):
    """
    Split loaded documents into overlapping chunks for embedding.

    Parameters
    ----------
    data : list
        Documents to split (e.g. the output of document_loader).

    Returns
    -------
    list
        Chunks whose stripped content is longer than 50 characters.

    Raises
    ------
    ValueError
        If splitting fails or no meaningful chunks remain after filtering.
    """
    # NOTE: local renamed from `text_splitter`, which shadowed this
    # function's own name.
    try:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )
        chunks = splitter.split_documents(data)
    except Exception as e:
        print(f"Error in text splitting: {e}")
        # Chain so the underlying splitter error is not lost; validation
        # below is outside the try so it is not double-wrapped.
        raise ValueError(f"Failed to split document into chunks: {e}") from e

    # Drop near-empty chunks that only add noise to retrieval.
    filtered_chunks = [c for c in chunks if len(c.page_content.strip()) > 50]

    print(f"Created {len(filtered_chunks)} chunks (filtered from {len(chunks)} total)")

    if not filtered_chunks:
        raise ValueError("No meaningful content chunks could be created from the document")

    return filtered_chunks
91
 
92
## Vector db and Embedding model
def vector_database(chunks):
    """
    Create a FAISS vector database from document chunks using a
    local Hugging Face sentence-transformers embedding model.

    Parameters
    ----------
    chunks : list
        Document chunks to embed (e.g. the output of text_splitter).

    Returns
    -------
    FAISS
        The populated vector store.

    Raises
    ------
    ValueError
        If both the primary and the fallback index-building paths fail.
    """
    # Identical embedding configuration for both paths; the original
    # fallback omitted normalize_embeddings, producing inconsistent vectors.
    embedding_kwargs = dict(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},  # CPU for broad compatibility
        encode_kwargs={'normalize_embeddings': True},
    )
    try:
        # Local embeddings model (more reliable than API-based inference).
        embedding_model = HuggingFaceEmbeddings(**embedding_kwargs)

        print(f"Processing {len(chunks)} chunks for embedding...")

        vectordb = FAISS.from_documents(chunks, embedding_model)
        print("Vector database created successfully!")
        return vectordb
    except Exception as e:
        print(f"Error creating vector database: {e}")
        print(f"Error type: {type(e)}")
        # Fallback: build the index from raw texts + metadata. The model is
        # re-created here because the primary failure may have happened
        # during model construction itself.
        try:
            print("Trying alternative approach with text extraction...")
            texts = [chunk.page_content for chunk in chunks]
            metadatas = [chunk.metadata for chunk in chunks]

            embedding_model = HuggingFaceEmbeddings(**embedding_kwargs)

            vectordb = FAISS.from_texts(texts, embedding_model, metadatas=metadatas)
            print("Alternative approach succeeded!")
            return vectordb
        except Exception as e2:
            print(f"Alternative approach also failed: {e2}")
            # Chain from the fallback failure so the full cause is preserved.
            raise ValueError(
                f"Failed to create embeddings. Original error: {e}. Alternative error: {e2}"
            ) from e2
134
 
135
  ## Retriever
136
  def retriever(file_path):