# Source: Hugging Face Space "raksama19/chatbot" (Space status: Sleeping)
# File size: 5,184 bytes
# Commit: 39d67a2

import fitz  # PyMuPDF
import json
import os
import re
from sentence_transformers import SentenceTransformer
import pickle

class PDFProcessor:
    """Build a searchable knowledge base from a directory of PDF files.

    Each PDF is read page by page, the page text is cleaned, split into
    overlapping word-based chunks, and every chunk is embedded with a
    sentence-transformers model.  The result can be written to a JSON file
    via :meth:`create_knowledge_base`.

    Attributes:
        pdf_directory: Directory that is scanned for PDF files.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.
        embedder: The loaded ``SentenceTransformer`` model.
    """

    # Model name passed to SentenceTransformer and recorded in the metadata.
    EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
    # Pages whose cleaned text is shorter than this (in characters) are skipped.
    MIN_PAGE_CHARS = 50
    # Chunks must be longer than this (in characters) to be kept.
    MIN_CHUNK_CHARS = 30

    # Lines that *start* with "WebAIM" are treated as page header artifacts.
    # The match is anchored to the line start so that a mid-sentence mention
    # of WebAIM never truncates real content.
    _HEADER_LINE_RE = re.compile(r'^[ \t]*WebAIM[^\n]*(?:\n|$)', re.MULTILINE)
    # "Page 3 of 12" style markers; \s+ lets the marker span a line break.
    _PAGE_MARKER_RE = re.compile(r'Page\s+\d+\s+of\s+\d+')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, pdf_directory="/Users/maraksa/Downloads/chatbot/WebAIM/",
                 chunk_size=500, overlap=50):
        """Load the embedding model and make sure the PDF directory exists.

        Args:
            pdf_directory: Directory containing the PDF files.  It is created
                (and a hint is printed) when it does not exist yet.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words repeated between consecutive chunks;
                must be smaller than ``chunk_size``.

        Raises:
            ValueError: If ``chunk_size``/``overlap`` are inconsistent.
        """
        self._check_chunking(chunk_size, overlap)
        self.pdf_directory = pdf_directory
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.embedder = SentenceTransformer(self.EMBEDDING_MODEL)

        # Check if directory exists
        if not os.path.exists(pdf_directory):
            # exist_ok guards against another process creating it in between.
            os.makedirs(pdf_directory, exist_ok=True)
            print(f"Created directory: {pdf_directory}")
            print("Please add your WebAIM PDF files to this directory.")

    @staticmethod
    def _check_chunking(chunk_size, overlap):
        """Raise ``ValueError`` unless ``0 <= overlap < chunk_size``."""
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if overlap < 0 or overlap >= chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap}, chunk_size={chunk_size}"
            )

    def clean_text(self, text):
        """Clean text extracted from a PDF page.

        Header lines and page markers are removed *before* whitespace is
        collapsed: the header pattern is line-based, so it can only work
        while the line breaks are still present.

        Args:
            text: Raw page text.

        Returns:
            The text with header lines starting with "WebAIM" and
            "Page X of Y" markers removed, all whitespace runs collapsed to a
            single space, and leading/trailing whitespace stripped.
        """
        text = self._HEADER_LINE_RE.sub('', text)
        text = self._PAGE_MARKER_RE.sub('', text)
        # Collapse last so that removed artifacts do not leave double spaces.
        text = self._WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def extract_text_from_pdf(self, pdf_path):
        """Extract cleaned, chunked text from one PDF.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A list of dicts, one per kept chunk, with the keys ``text``,
            ``source_file`` (base name of ``pdf_path``), ``page_number``
            (1-based), ``chunk_id`` (index of the chunk within its page) and
            ``source_type`` (always ``'WebAIM'``).
        """
        source_file = os.path.basename(pdf_path)
        print(f"Processing: {source_file}")
        pages_content = []

        doc = fitz.open(pdf_path)
        try:
            for page_number, page in enumerate(doc, start=1):
                cleaned_text = self.clean_text(page.get_text())

                # Skip pages with very little content
                if len(cleaned_text) < self.MIN_PAGE_CHARS:
                    continue

                chunks = self.chunk_text(
                    cleaned_text,
                    chunk_size=self.chunk_size,
                    overlap=self.overlap,
                )

                for chunk_idx, chunk in enumerate(chunks):
                    # Only keep substantial chunks; chunk_id keeps the
                    # original per-page index even when a chunk is dropped.
                    if len(chunk.strip()) > self.MIN_CHUNK_CHARS:
                        pages_content.append({
                            'text': chunk,
                            'source_file': source_file,
                            'page_number': page_number,
                            'chunk_id': chunk_idx,
                            'source_type': 'WebAIM'
                        })
        finally:
            # Release the document even if a page fails to extract.
            doc.close()

        print(f"✅ Extracted {len(pages_content)} chunks from {source_file}")
        return pages_content

    def chunk_text(self, text, chunk_size=500, overlap=50):
        """Split text into overlapping word-based chunks.

        Consecutive chunks start ``chunk_size - overlap`` words apart.
        Chunking stops as soon as a chunk reaches the last word, so no
        trailing chunk is produced that would consist solely of words
        already contained in the previous chunk.

        Args:
            text: Text to split; words are separated by whitespace.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by consecutive chunks.

        Returns:
            A list of chunk strings (words joined by single spaces); empty
            when ``text`` contains no words.

        Raises:
            ValueError: If ``chunk_size <= 0`` or not
                ``0 <= overlap < chunk_size``.
        """
        self._check_chunking(chunk_size, overlap)

        words = text.split()
        step = chunk_size - overlap
        chunks = []

        for start in range(0, len(words), step):
            end = start + chunk_size
            chunks.append(' '.join(words[start:end]))
            if end >= len(words):
                # This chunk already covers the tail of the text.
                break

        return chunks

    def process_all_pdfs(self):
        """Process every PDF in ``self.pdf_directory``.

        Files are matched case-insensitively on the ``.pdf`` extension and
        handled in sorted order so repeated runs yield the same chunk order.
        A file that fails to process is reported and skipped; the remaining
        files are still processed.

        Returns:
            The concatenated chunk dicts of all successfully processed PDFs,
            or an empty list when the directory contains no PDF files.
        """
        all_content = []

        # Check if PDFs exist
        pdf_files = sorted(
            name for name in os.listdir(self.pdf_directory)
            if name.lower().endswith('.pdf')
        )

        if not pdf_files:
            print(f"❌ No PDF files found in {self.pdf_directory}")
            print(f"Please add your WebAIM PDF files to {self.pdf_directory}")
            return []

        print(f"Found {len(pdf_files)} PDF files:")
        for pdf_file in pdf_files:
            print(f"  - {pdf_file}")

        for filename in pdf_files:
            pdf_path = os.path.join(self.pdf_directory, filename)
            try:
                content = self.extract_text_from_pdf(pdf_path)
            except Exception as e:
                # Deliberate best effort: one broken PDF must not abort the run.
                print(f"❌ Error processing {filename}: {e}")
            else:
                all_content.extend(content)

        return all_content

    def create_knowledge_base(self, output_path="knowledge_base.json"):
        """Create the knowledge base from the PDFs and save it as JSON.

        Args:
            output_path: Path of the JSON file to write.

        Returns:
            A dict with the keys ``content`` (chunk dicts), ``embeddings``
            (one list of numbers per chunk, in the same order) and
            ``metadata``; or ``None`` when no content could be extracted.
        """
        print("🚀 Starting PDF processing...")
        all_content = self.process_all_pdfs()

        if not all_content:
            print("❌ No content extracted. Please check your PDF files.")
            return None

        print(f"📄 Total chunks extracted: {len(all_content)}")
        print("🧠 Creating embeddings... (this may take a few minutes)")

        texts = [item['text'] for item in all_content]
        embeddings = self.embedder.encode(texts, show_progress_bar=True)

        # Save knowledge base; the metadata reflects the settings actually used.
        knowledge_base = {
            'content': all_content,
            'embeddings': embeddings.tolist(),
            'metadata': {
                'total_chunks': len(all_content),
                'embedding_model': self.EMBEDDING_MODEL,
                'chunk_size': self.chunk_size,
                'overlap': self.overlap
            }
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(knowledge_base, f, indent=2)

        print(f"✅ Knowledge base saved to {output_path}")
        print("📊 Summary:")
        print(f"   - Total chunks: {len(all_content)}")
        print(f"   - Embedding dimensions: {len(embeddings[0])}")
        print(f"   - File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")

        return knowledge_base

# Script entry point: build the knowledge base using the default PDF
# directory and the default output file.
if __name__ == "__main__":
    knowledge_base = PDFProcessor().create_knowledge_base()