Spaces:

menikev
/

KnowYourConstitutionBot

Sleeping

App Files Community

menikev committed on Aug 20, 2025

Commit

4768c21

verified ·

1 Parent(s): befecdb

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -20

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains import RetrievalQA
 from langchain_community.llms import HuggingFaceHub
-from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 # You can use this section to suppress warnings generated by your code:
 def warn(*args, **kwargs):
@@ -39,42 +39,98 @@ def document_loader(file_path):
     """
     Loads a PDF document from the given file path.
     """
-    loader = PyPDFLoader(file_path)
-    loaded_document = loader.load()
-    return loaded_document
 ## Text splitter
 def text_splitter(data):
     """
     Splits the loaded document into smaller chunks for processing.
     """
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len,
-    )
-    chunks = text_splitter.split_documents(data)
-    return chunks
 ## Vector db and Embedding model
 def vector_database(chunks):
     """
     Creates a FAISS vector database from the document chunks using a
-    Hugging Face embeddings model.
     """
-    # Fixed: Using proper parameter name for HuggingFaceInferenceAPIEmbeddings
-    embedding_model = HuggingFaceInferenceAPIEmbeddings(
-        api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"],
-        model_name="sentence-transformers/all-MiniLM-L6-v2"
-    )
-    # Add error handling for embedding creation
     try:
         vectordb = FAISS.from_documents(chunks, embedding_model)
         return vectordb
     except Exception as e:
         print(f"Error creating vector database: {e}")
-        raise ValueError(f"Failed to create embeddings: {e}")
 ## Retriever
 def retriever(file_path):

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains import RetrievalQA
 from langchain_community.llms import HuggingFaceHub
+from langchain_community.embeddings import HuggingFaceEmbeddings
 # You can use this section to suppress warnings generated by your code:
 def warn(*args, **kwargs):
     """
     Loads a PDF document from the given file path.
     """
+    try:
+        loader = PyPDFLoader(file_path)
+        loaded_document = loader.load()
+        # Check if document was loaded successfully
+        if not loaded_document:
+            raise ValueError("No content could be extracted from the PDF")
+        print(f"Successfully loaded {len(loaded_document)} pages from PDF")
+        # Check if pages have content
+        total_content = sum(len(doc.page_content.strip()) for doc in loaded_document)
+        if total_content == 0:
+            raise ValueError("PDF appears to be empty or contains no extractable text")
+        print(f"Total content length: {total_content} characters")
+        return loaded_document
+    except Exception as e:
+        print(f"Error loading document: {e}")
+        raise ValueError(f"Failed to load PDF: {e}")
 ## Text splitter
 def text_splitter(data):
     """
     Splits the loaded document into smaller chunks for processing.
     """
+    try:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+        chunks = text_splitter.split_documents(data)
+        # Filter out very small chunks
+        filtered_chunks = [chunk for chunk in chunks if len(chunk.page_content.strip()) > 50]
+        print(f"Created {len(filtered_chunks)} chunks (filtered from {len(chunks)} total)")
+        if not filtered_chunks:
+            raise ValueError("No meaningful content chunks could be created from the document")
+        return filtered_chunks
+    except Exception as e:
+        print(f"Error in text splitting: {e}")
+        raise ValueError(f"Failed to split document into chunks: {e}")
 ## Vector db and Embedding model
 def vector_database(chunks):
     """
     Creates a FAISS vector database from the document chunks using a
+    local Hugging Face embeddings model.
     """
     try:
+        # Using local embeddings model (more reliable than API-based)
+        embedding_model = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2",
+            model_kwargs={'device': 'cpu'},  # Use CPU for compatibility
+            encode_kwargs={'normalize_embeddings': True}
+        )
+        print(f"Processing {len(chunks)} chunks for embedding...")
+        # Create vector database
         vectordb = FAISS.from_documents(chunks, embedding_model)
+        print("Vector database created successfully!")
         return vectordb
     except Exception as e:
         print(f"Error creating vector database: {e}")
+        print(f"Error type: {type(e)}")
+        # Try alternative approach with text extraction
+        try:
+            print("Trying alternative approach with text extraction...")
+            texts = [chunk.page_content for chunk in chunks]
+            metadatas = [chunk.metadata for chunk in chunks]
+            embedding_model = HuggingFaceEmbeddings(
+                model_name="sentence-transformers/all-MiniLM-L6-v2",
+                model_kwargs={'device': 'cpu'}
+            )
+            vectordb = FAISS.from_texts(texts, embedding_model, metadatas=metadatas)
+            print("Alternative approach succeeded!")
+            return vectordb
+        except Exception as e2:
+            print(f"Alternative approach also failed: {e2}")
+            raise ValueError(f"Failed to create embeddings. Original error: {e}. Alternative error: {e2}")
 ## Retriever
 def retriever(file_path):