Spaces:

alx-d
/

PhiRAG

Running

App Files Files Community

alx-d committed on Apr 6

Commit

01330c2

verified ·

1 Parent(s): 97f878b

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

advanced_rag.py +99 -23

advanced_rag.py CHANGED Viewed

@@ -397,31 +397,25 @@ def load_txt_from_url(url: str) -> Document:
 from pdfminer.high_level import extract_text
 from langchain_core.documents import Document
 def get_confirm_token(response):
     """
     Return the value of the first cookie on ``response`` whose name starts
     with "download_warning", or None when no such cookie is present.
     """
     warning_values = (
         cookie_value
         for cookie_name, cookie_value in response.cookies.items()
         if cookie_name.startswith("download_warning")
     )
     return next(warning_values, None)
 def download_file_from_google_drive(file_id, destination):
     """
     Download a file from Google Drive handling large file confirmation.

     The file is requested once; if that response carries a confirmation
     token (see get_confirm_token), the request is repeated on the same
     session with ``confirm=<token>``. The final response is streamed to
     ``destination`` by save_response_content.

     Args:
         file_id: Google Drive file ID to download.
         destination: Path of the local file to write.
     """
     URL = "https://docs.google.com/uc?export=download&confirm=1"
     # Use the session as a context manager so its pooled connections are
     # released even if the download fails part-way through.
     with requests.Session() as session:
         response = session.get(URL, params={"id": file_id}, stream=True)
         token = get_confirm_token(response)
         if token:
             # The first streamed response is never read; close it explicitly
             # so its connection goes back to the pool before retrying.
             response.close()
             params = {"id": file_id, "confirm": token}
             response = session.get(URL, params=params, stream=True)
         # Must run inside the ``with`` block: the body is streamed lazily.
         save_response_content(response, destination)
 def save_response_content(response, destination):
     CHUNK_SIZE = 32768
     with open(destination, "wb") as f:
@@ -429,47 +423,131 @@ def save_response_content(response, destination):
             if chunk:
                 f.write(chunk)
 def extract_file_id(drive_link: str) -> str:
     """
     Pull the file ID out of a Google Drive link of the form ``.../d/<id>/...``.

     Raises:
         ValueError: if ``drive_link`` contains no ``/d/<id>`` segment.
     """
     found = re.search(r"/d/([a-zA-Z0-9_-]+)", drive_link)
     if found is None:
         raise ValueError("Could not extract file ID from the provided Google Drive link.")
     return found.group(1)
-def load_file_from_google_drive(link: str) -> list:
     """
-    Load a document from a Google Drive link using pdfminer to extract text.
     Returns a list of LangChain Document objects.
     """
     file_id = extract_file_id(link)
-    print(f"[DEBUG] Extracted file ID: {file_id}")
     with tempfile.NamedTemporaryFile(delete=False) as temp_file:  # delete=False: the path is reused after this block and removed in the finally below
         temp_path = temp_file.name
     try:
         download_file_from_google_drive(file_id, temp_path)
-        print(f"[DEBUG] File downloaded to: {temp_path}")
         try:
             full_text = extract_text(temp_path)
             if not full_text.strip():
                 raise ValueError("Extracted text is empty. The PDF might be image-based.")
-            print("[DEBUG] Extracted preview text from PDF:")
-            print(full_text[:1000])  # Preview first 1000 characters
             document = Document(page_content=full_text, metadata={"source": link})
             return [document]
         except Exception as e:
-            print(f"[ERROR] Could not extract text from PDF: {e}")
             return []  # extraction failures (including the empty-text ValueError above) yield an empty list, not an exception
     finally:
         if os.path.exists(temp_path):
             os.remove(temp_path)
 class ElevatedRagChain:
     def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
                  bm25_weight: float = 0.6, temperature: float = 0.5, top_p: float = 0.95) -> None:
@@ -768,8 +846,6 @@ class ElevatedRagChain:
         self.elevated_rag_chain = base_runnable | prompt_runnable | self.llm | format_response
         debug_print("Elevated RAG chain successfully built and ready to use.")
     def get_current_context(self) -> str:
         base_context = "\n".join([str(doc) for doc in self.split_data[:3]]) if self.split_data else "No context available."
         history_summary = "\n\n---\n**Recent Conversations (last 3):**\n"

 from pdfminer.high_level import extract_text
 from langchain_core.documents import Document
 def get_confirm_token(response):
     """
     Return the value of the first cookie on ``response`` whose name starts
     with "download_warning", or None when no such cookie is present.
     """
     warning_values = (
         cookie_value
         for cookie_name, cookie_value in response.cookies.items()
         if cookie_name.startswith("download_warning")
     )
     return next(warning_values, None)
 def download_file_from_google_drive(file_id, destination):
     """
     Download a file from Google Drive handling large file confirmation.

     The file is requested once; if that response carries a confirmation
     token (see get_confirm_token), the request is repeated on the same
     session with ``confirm=<token>``. The final response is streamed to
     ``destination`` by save_response_content.

     Args:
         file_id: Google Drive file ID to download.
         destination: Path of the local file to write.
     """
     URL = "https://docs.google.com/uc?export=download&confirm=1"
     # Use the session as a context manager so its pooled connections are
     # released even if the download fails part-way through.
     with requests.Session() as session:
         response = session.get(URL, params={"id": file_id}, stream=True)
         token = get_confirm_token(response)
         if token:
             # The first streamed response is never read; close it explicitly
             # so its connection goes back to the pool before retrying.
             response.close()
             params = {"id": file_id, "confirm": token}
             response = session.get(URL, params=params, stream=True)
         # Must run inside the ``with`` block: the body is streamed lazily.
         save_response_content(response, destination)
 def save_response_content(response, destination):
     CHUNK_SIZE = 32768
     with open(destination, "wb") as f:
             if chunk:
                 f.write(chunk)
 def extract_file_id(drive_link: str) -> str:
     """
     Extract the Google Drive file ID from a shared link.

     Supported link shapes, tried in this order:
       * ``.../d/<id>/...``
       * ``...?id=<id>`` or ``...&id=<id>`` -- covers ``open?id=<id>`` as well
         as direct-download links such as ``uc?export=download&id=<id>``.

     Args:
         drive_link: The Google Drive URL to inspect.

     Returns:
         The file ID captured from the first pattern that matches.

     Raises:
         ValueError: if no file ID can be found in ``drive_link``.
     """
     id_patterns = (
         r"/d/([a-zA-Z0-9_-]+)",
         # Any "id" query parameter, not only the one following "open?".
         r"[?&]id=([a-zA-Z0-9_-]+)",
     )
     for pattern in id_patterns:
         match = re.search(pattern, drive_link)
         if match:
             return match.group(1)
     raise ValueError("Could not extract file ID from the provided Google Drive link.")
+def load_txt_from_google_drive(link: str) -> Document:
+    """
+    Load text from a Google Drive shared link
+    """
+    file_id = extract_file_id(link)
+    # Create direct download link
+    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+    # Request the file content
+    response = requests.get(download_url)
+    if response.status_code != 200:
+        raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")
+    # Create a Document object
+    content = response.text
+    if not content.strip():
+        raise ValueError(f"TXT file from Google Drive is empty.")
+    metadata = {"source": link}
+    return Document(page_content=content, metadata=metadata)
+def load_pdf_from_google_drive(link: str) -> list:
     """
+    Load a PDF document from a Google Drive link using pdfminer to extract text.
     Returns a list of LangChain Document objects.
     """
     file_id = extract_file_id(link)
+    debug_print(f"Extracted file ID: {file_id}")
     with tempfile.NamedTemporaryFile(delete=False) as temp_file:
         temp_path = temp_file.name
     try:
         download_file_from_google_drive(file_id, temp_path)
+        debug_print(f"File downloaded to: {temp_path}")
         try:
             full_text = extract_text(temp_path)
             if not full_text.strip():
                 raise ValueError("Extracted text is empty. The PDF might be image-based.")
+            debug_print("Extracted preview text from PDF:")
+            debug_print(full_text[:1000])  # Preview first 1000 characters
             document = Document(page_content=full_text, metadata={"source": link})
             return [document]
         except Exception as e:
+            debug_print(f"Could not extract text from PDF: {e}")
             return []
     finally:
         if os.path.exists(temp_path):
             os.remove(temp_path)
+def load_file_from_google_drive(link: str) -> list:
+    """
+    Load a document from a Google Drive link, detecting whether it's a PDF or TXT file.
+    Returns a list of LangChain Document objects.
+    """
+    file_id = extract_file_id(link)
+    # Create direct download link
+    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+    # First, try to read a small portion of the file to determine its type
+    try:
+        # Use a streaming request to read just the first part of the file
+        response = requests.get(download_url, stream=True)
+        if response.status_code != 200:
+            raise ValueError(f"Failed to download file from Google Drive. Status code: {response.status_code}")
+        # Read just the first 1024 bytes to check file signature
+        file_start = next(response.iter_content(1024))
+        response.close()  # Close the stream
+        # Convert bytes to string for pattern matching
+        file_start_str = file_start.decode('utf-8', errors='ignore')
+        # Check for PDF signature (%PDF-) at the beginning of the file
+        if file_start_str.startswith('%PDF-') or b'%PDF-' in file_start:
+            debug_print(f"Detected PDF file by content signature from Google Drive: {link}")
+            return load_pdf_from_google_drive(link)
+        else:
+            # If not a PDF, try as text
+            debug_print(f"No PDF signature found, treating as TXT file from Google Drive: {link}")
+            # Since we already downloaded part of the file, get the full content
+            response = requests.get(download_url)
+            if response.status_code != 200:
+                raise ValueError(f"Failed to download complete file from Google Drive. Status code: {response.status_code}")
+            content = response.text
+            if not content.strip():
+                raise ValueError(f"TXT file from Google Drive is empty.")
+            doc = Document(page_content=content, metadata={"source": link})
+            return [doc]
+    except UnicodeDecodeError:
+        # If we get a decode error, it's likely a binary file like PDF
+        debug_print(f"Got decode error, likely a binary file. Treating as PDF from Google Drive: {link}")
+        return load_pdf_from_google_drive(link)
+    except Exception as e:
+        debug_print(f"Error detecting file type: {e}")
+        # Fall back to trying both formats
+        debug_print("Falling back to trying both formats for Google Drive file")
+        try:
+            return load_pdf_from_google_drive(link)
+        except Exception as pdf_error:
+            debug_print(f"Failed to load as PDF: {pdf_error}")
+            try:
+                doc = load_txt_from_google_drive(link)
+                return [doc]
+            except Exception as txt_error:
+                debug_print(f"Failed to load as TXT: {txt_error}")
+                raise ValueError(f"Could not load file from Google Drive as either PDF or TXT: {link}")
 class ElevatedRagChain:
     def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
                  bm25_weight: float = 0.6, temperature: float = 0.5, top_p: float = 0.95) -> None:
         self.elevated_rag_chain = base_runnable | prompt_runnable | self.llm | format_response
         debug_print("Elevated RAG chain successfully built and ready to use.")
     def get_current_context(self) -> str:
         base_context = "\n".join([str(doc) for doc in self.split_data[:3]]) if self.split_data else "No context available."
         history_summary = "\n\n---\n**Recent Conversations (last 3):**\n"