File size: 7,137 Bytes
c5e1945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
import logging

# For PDF processing - now using LangChain's PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader

# Module-level logger named after this module; no handlers or levels are set here.
logger = logging.getLogger(__name__)

# --- Existing functions (as provided by user) ---

# Resolve COMPANY_INFO_DIR, the base directory used to build the default
# FAQ.csv and info.md paths in the loaders below.
# Prefer the value exported by config.py; if that import fails, log a warning
# and fall back to a relative placeholder so this module can still be imported.
# In a real application, ensure config.py is accessible or pass explicit paths.
try:
    from config import COMPANY_INFO_DIR
except ImportError:
    # ImportError covers both a missing config module and a config module
    # that does not define COMPANY_INFO_DIR.
    logger.warning("COMPANY_INFO_DIR not found in config.py. Using a default placeholder.")
    COMPANY_INFO_DIR = Path("./company_info") # Placeholder, relative to the current working directory; adjust as needed

def load_faq_documents(faq_path: Path = Path(COMPANY_INFO_DIR) / "FAQ.csv") -> List[Document]:
    """
    Load and process FAQ documents from CSV file.

    Every CSV row becomes one Document whose text holds a "Question: ..."
    line followed by an "Answer: ..." line. Cells that are empty in the CSV
    (pandas reads them as NaN) are rendered as an empty string instead of
    the literal text "nan".

    Args:
        faq_path: Path to the FAQ CSV file. The file must provide the
            'Question' and 'Answer' columns.

    Returns:
        List of Document objects, one per CSV row, in row order.

    Raises:
        FileNotFoundError: If faq_path does not exist.
        ValueError: If the CSV lacks a required column.
        Exception: Any other failure (e.g. a pandas parsing error) is
            logged and re-raised unchanged.
    """
    try:
        # Validate file exists
        if not faq_path.exists():
            raise FileNotFoundError(f"FAQ file not found at {faq_path}")

        df = pd.read_csv(faq_path)

        # Validate required columns
        required_cols = ['Question', 'Answer']
        if not all(col in df.columns for col in required_cols):
            raise ValueError(f"CSV must contain columns: {required_cols}")

        documents = []
        for idx, row in df.iterrows():
            # Missing cells arrive as NaN; formatting them directly would put
            # the string "nan" into the indexed text, so blank them instead.
            question = "" if pd.isna(row['Question']) else row['Question']
            answer = "" if pd.isna(row['Answer']) else row['Answer']
            content = f"Question: {question}\nAnswer: {answer}"

            doc = Document(
                page_content=content,
                metadata={
                    "source": "company_faq",
                    "type": "faq",
                    # DataFrame index label of the row, stored as a string
                    "doc_id": f"{idx}",
                    "filename": faq_path.name
                }
            )
            documents.append(doc)

        logger.info(f"Loaded {len(documents)} FAQ documents from {faq_path.name}")
        return documents

    except Exception as e:
        # Log for operators, then let the caller decide how to handle it
        logger.error(f"Error loading FAQ documents from {faq_path.name}: {str(e)}")
        raise


def load_company_info(info_path: Path = Path(COMPANY_INFO_DIR) / "info.md") -> Document:
    """
    Read the company information markdown file into a single Document.

    Args:
        info_path: Location of the markdown file holding the company info

    Returns:
        One Document whose page_content is the complete file text

    Raises:
        FileNotFoundError: If info_path does not exist
        Exception: Any other failure is logged and re-raised unchanged
    """
    try:
        # Fail with an explicit message when the file is missing
        if not info_path.exists():
            raise FileNotFoundError(f"Info file not found at {info_path}")

        markdown_text = info_path.read_text(encoding='utf-8')

        info_metadata = {
            "source": "company_info",
            "type": "general_info",
            "filename": info_path.name,
            "doc_id": "company_info_main"
        }
        company_doc = Document(page_content=markdown_text, metadata=info_metadata)

        logger.info(f"Loaded company info document from {info_path.name}")
        return company_doc

    except Exception as exc:
        # Record the failure, then hand the original exception to the caller
        logger.error(f"Error loading company info from {info_path.name}: {str(exc)}")
        raise

# --- New functions for PDF and TXT loading ---

def load_pdf_document(file_path: Path) -> List[Document]:
    """
    Read a PDF with LangChain's PyPDFLoader and tag the resulting documents.

    Every Document returned by the loader gets "source", "type", "filename"
    and "doc_id" metadata entries written (existing values are overwritten).

    Args:
        file_path: Location of the PDF file.

    Returns:
        The Document objects produced by PyPDFLoader, with the metadata
        described above applied.

    Raises:
        FileNotFoundError: If file_path does not exist.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    pages = []
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"PDF file not found at {file_path}")

        # The loader is given the path as a plain string
        loaded_pages = PyPDFLoader(str(file_path)).load()

        for page_doc in loaded_pages:
            # Fall back to 0 when the loader did not record a page index
            page_index = page_doc.metadata.get("page", 0)
            page_doc.metadata.update({
                "source": "uploaded_file",
                "type": "pdf",
                "filename": file_path.name,
                # +1 turns the stored page index into a 1-based page number
                "doc_id": f"{file_path.stem}_page_{page_index + 1}",
            })

        pages.extend(loaded_pages)

        logger.info(f"Loaded {len(pages)} pages from PDF using PyPDFLoader: {file_path.name}")
        return pages
    except Exception as exc:
        logger.error(f"Error loading PDF file {file_path.name} with PyPDFLoader: {str(exc)}")
        raise

def load_txt_document(file_path: Path) -> Document:
    """
    Read a UTF-8 text file into a single Document.

    Args:
        file_path: Location of the TXT file.

    Returns:
        A Document whose page_content is the full file text and whose
        doc_id is the file name without its extension.

    Raises:
        FileNotFoundError: If file_path does not exist.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"TXT file not found at {file_path}")

        text = file_path.read_text(encoding='utf-8')

        txt_metadata = {
            "source": "uploaded_file",
            "type": "txt",
            "filename": file_path.name,
            "doc_id": file_path.stem
        }
        txt_doc = Document(page_content=text, metadata=txt_metadata)

        logger.info(f"Loaded TXT file: {file_path.name}")
        return txt_doc
    except Exception as exc:
        # Record the failure, then hand the original exception to the caller
        logger.error(f"Error loading TXT file {file_path.name}: {str(exc)}")
        raise


def process_uploaded_file(file_path: Path) -> List[Document]:
    """
    Route an uploaded file to the loader that matches its extension.

    The extension is compared case-insensitively. '.pdf' files go to
    load_pdf_document and '.txt' files go to load_txt_document. This
    function is best-effort: failures are logged and not raised.

    Args:
        file_path: Location of the uploaded file.

    Returns:
        The list of Document objects extracted from the file, or an empty
        list when the file is missing, its type is unsupported, or loading
        fails.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"File not found at {file_path}")

        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return load_pdf_document(file_path)

        if suffix == '.txt':
            # The TXT loader yields one Document; wrap it to keep the list contract
            return [load_txt_document(file_path)]

        # Anything else is reported and skipped rather than treated as an error
        logger.warning(f"Unsupported file type for {file_path.name}: {suffix}")
        return []

    except FileNotFoundError as missing:
        logger.error(f"Processing failed: {missing}")
    except Exception as exc:
        logger.error(f"An unexpected error occurred while processing {file_path.name}: {str(exc)}")

    # Reached only after a logged failure
    return []