# Source: Hugging Face Space yekkala/Homoeopathy-Bot (status: Sleeping)
# File size: 3,023 bytes
# Revision: 099c54b

"""
Process uploaded documents for knowledge base
"""
import os
import PyPDF2
import docx
from typing import Dict, List
import hashlib

class DocumentProcessor:
    """Save uploaded documents to disk and extract their text.

    Files are written under ``upload_dir`` as ``<md5>_<name>``, so the same
    content uploaded under the same name always maps to the same path.
    Text extraction is supported for ``.pdf``, ``.docx`` and ``.txt`` files.
    """

    def __init__(self, upload_dir: str = "knowledge_base/internal_docs"):
        """Remember the upload directory and create it if it is missing."""
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)

    def process_uploaded_file(self, uploaded_file) -> Dict:
        """Store an uploaded file and extract its text.

        Args:
            uploaded_file: Object exposing ``name`` (str) and ``getvalue()``
                (bytes). Presumably a Streamlit ``UploadedFile`` -- confirm
                against the caller.

        Returns:
            Dict with ``filename`` (name as supplied), ``file_hash`` (MD5 hex
            digest of the content), ``file_path`` (where the copy was saved),
            ``text`` (first 5000 characters of the extracted text),
            ``word_count`` (whitespace-separated words in the full text) and
            ``status`` (always ``'processed'``). Unsupported file types are
            still saved and yield empty text.
        """
        data = uploaded_file.getvalue()
        # MD5 is used as a content fingerprint for the file name only.
        file_hash = hashlib.md5(data).hexdigest()

        # The name comes from the client: keep only its final component so a
        # value such as "../../x.txt" (or a Windows-style path) cannot place
        # the file outside upload_dir.
        safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
        file_path = os.path.join(self.upload_dir, f"{file_hash}_{safe_name}")

        # Save the file
        with open(file_path, "wb") as f:
            f.write(data)

        # Extract text based on file type; compare case-insensitively so
        # "REPORT.PDF" is handled like "report.pdf".
        lowered_name = uploaded_file.name.lower()
        text = ""
        if lowered_name.endswith('.pdf'):
            text = self._extract_pdf_text(file_path)
        elif lowered_name.endswith('.docx'):
            text = self._extract_docx_text(file_path)
        elif lowered_name.endswith('.txt'):
            # Best effort, like the PDF/DOCX paths: undecodable bytes become
            # U+FFFD instead of raising after the file was already saved.
            text = data.decode('utf-8', errors='replace')

        return {
            'filename': uploaded_file.name,
            'file_hash': file_hash,
            'file_path': file_path,
            'text': text[:5000],  # Limit text for processing
            'word_count': len(text.split()),
            'status': 'processed'
        }

    def _extract_pdf_text(self, file_path: str) -> str:
        """Extract text from a PDF file, one newline-terminated chunk per page.

        Errors are printed and whatever text was collected before the failure
        is returned (an empty string if nothing could be read).
        """
        pages = []
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    # Defensive: a page that yields no text object must not
                    # abort extraction of the remaining pages.
                    pages.append((page.extract_text() or "") + "\n")
        except Exception as e:
            print(f"Error reading PDF: {e}")
        return "".join(pages)

    def _extract_docx_text(self, file_path: str) -> str:
        """Extract text from a DOCX file, one newline-terminated line per paragraph.

        Errors are printed and whatever text was collected before the failure
        is returned (an empty string if nothing could be read).
        """
        paragraphs = []
        try:
            doc = docx.Document(file_path)
            for paragraph in doc.paragraphs:
                paragraphs.append(paragraph.text + "\n")
        except Exception as e:
            print(f"Error reading DOCX: {e}")
        return "".join(paragraphs)

    def search_in_documents(self, query: str, documents: List[Dict]) -> List[Dict]:
        """Case-insensitively search for ``query`` in processed documents.

        Args:
            query: Substring to look for. An empty or whitespace-only query
                returns no results instead of matching every document.
            documents: Dicts carrying at least ``text`` and ``filename``, as
                returned by ``process_uploaded_file``.

        Returns:
            One dict per matching document with ``document`` (its filename),
            ``match`` (the query as given), ``context`` (up to 100 characters
            either side of the first occurrence, wrapped in ``...``) and
            ``relevance`` (always 1.0).
        """
        if not query.strip():
            return []

        results = []
        query_lower = query.lower()

        for doc in documents:
            text = doc['text']
            # Lower-case and scan once; -1 means the document has no match.
            idx = text.lower().find(query_lower)
            if idx == -1:
                continue

            # Find context around the match
            start = max(0, idx - 100)
            end = min(len(text), idx + len(query) + 100)

            results.append({
                'document': doc['filename'],
                'match': query,
                'context': f"...{text[start:end]}...",
                'relevance': 1.0
            })

        return results