nexusbert committed on
Commit
bfdf1f3
·
verified ·
1 Parent(s): e443795

Upload 11 files

Browse files
Files changed (2) hide show
  1. app.py +1 -0
  2. rag/ingest.py +41 -14
app.py CHANGED
@@ -190,6 +190,7 @@ async def ask_question(
190
  model: str = Form(default="gemini-2.5-flash"),
191
  session_id: Optional[str] = Form(default=None),
192
  image: Optional[UploadFile] = File(default=None),
 
193
  api_key: str = Depends(verify_api_key)
194
  ):
195
  check_rate_limit(req)
 
190
  model: str = Form(default="gemini-2.5-flash"),
191
  session_id: Optional[str] = Form(default=None),
192
  image: Optional[UploadFile] = File(default=None),
193
+ document: Optional[UploadFile] = File(default=None),
194
  api_key: str = Depends(verify_api_key)
195
  ):
196
  check_rate_limit(req)
rag/ingest.py CHANGED
@@ -42,21 +42,48 @@ def get_pinecone_index(pc=None):
42
  return pc.Index(PINECONE_INDEX)
43
 
44
 
45
- def extract_text_from_pdf(pdf_path: Path) -> str:
46
- text_parts = []
47
-
48
- try:
49
- with pdfplumber.open(pdf_path) as pdf:
50
- for page_num, page in enumerate(pdf.pages, 1):
51
- page_text = page.extract_text()
52
- if page_text:
53
- text_parts.append(f"[Page {page_num}]\n{page_text}")
54
- except Exception as e:
55
- print(f" Error extracting text from {pdf_path.name}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  return ""
57
-
58
- full_text = "\n\n".join(text_parts)
59
- return clean_text(full_text)
60
 
61
 
62
  def generate_chunk_id(doc_name: str, chunk_index: int) -> str:
 
42
  return pc.Index(PINECONE_INDEX)
43
 
44
 
45
def extract_text_from_file(file_path: Path) -> str:
    """Extract plain text from a PDF, Word (.doc/.docx), or .txt file.

    Returns the cleaned text, or "" when the file type is unsupported or
    extraction fails. Failures are reported via print and never raised.
    """
    suffix = file_path.suffix.lower()

    if suffix == ".pdf":
        # Gather per-page text, tagging each chunk with its 1-based page number.
        pages = []
        try:
            with pdfplumber.open(file_path) as pdf:
                for idx, page in enumerate(pdf.pages, 1):
                    extracted = page.extract_text()
                    if extracted:
                        pages.append(f"[Page {idx}]\n{extracted}")
        except Exception as e:
            print(f" Error extracting text from {file_path.name}: {e}")
            return ""
        return clean_text("\n\n".join(pages))

    if suffix in (".doc", ".docx"):
        # NOTE(review): docx2txt targets .docx; a legacy binary .doc will
        # presumably raise and fall into the generic handler — confirm.
        try:
            import docx2txt
            return clean_text(docx2txt.process(file_path))
        except ImportError:
            print(f" docx2txt not installed. Cannot process {file_path.name}")
            return ""
        except Exception as e:
            print(f" Error extracting text from {file_path.name}: {e}")
            return ""

    if suffix == ".txt":
        try:
            # errors='ignore' drops undecodable bytes rather than failing.
            return clean_text(file_path.read_text(encoding='utf-8', errors='ignore'))
        except Exception as e:
            print(f" Error reading {file_path.name}: {e}")
            return ""

    print(f" Unsupported file type: {suffix}")
    return ""
 
 
 
87
 
88
 
89
  def generate_chunk_id(doc_name: str, chunk_index: int) -> str: