ShunTay12 committed
Commit 251690c · 1 Parent(s): 3c5dc5a

Added text image OCR
.gitignore CHANGED
@@ -1,9 +1,17 @@
+# Python-generated files
 __pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
 .env
-.agent/
-.venv/
-.idea/
-.vscode/
-app/services/query_preprocessor.py
-.ruff_cache
+.agent
 chunking_embedding_gguf.ipynb
+ocr_test.py
+.idea
+.vscode
+.gitignore
.python-version CHANGED
@@ -1 +1 @@
-3.12.11
+3.13
app/core/config.py CHANGED
@@ -22,13 +22,19 @@ class Settings(BaseSettings):
     supabase_service_role_key: str
 
     # --- AI Provider ---
-    ai_provider: Literal["openrouter", "groq"] = "groq"
-    openrouter_api_key: str = ""
-    groq_api_key: str = ""
+    openrouter_api_key: str
+    groq_api_key: str
+    model_openrouter: str
+    base_url_openrouter: str
+    model_groq: str
+
+    # --- OCR Model ---
+    ocr_model_name: str = "zai-org/GLM-OCR"
 
     # --- Embedding Model ---
-    model_name: str = "BAAI/bge-m3"
+    embedding_model_name: str = "BAAI/bge-m3"
     embedding_dimensions: int = 1024
+    device: str = "cuda"
 
     # --- Chunking ---
     chunk_size: int = 512
@@ -40,27 +46,30 @@ class Settings(BaseSettings):
     debug: bool = True
 
     @cached_property
-    def llm(self):
+    def answer_llm(self):
+        """Lazily initialize the LLM based on the configured provider."""
+        from langchain_groq import ChatGroq
+
+        return ChatGroq(
+            api_key=self.groq_api_key,
+            model=self.model_groq,
+            temperature=0,
+            max_tokens=None,
+            timeout=None,
+            max_retries=2,
+        )
+
+    @cached_property
+    def preprocess_llm(self):
         """Lazily initialize the LLM based on the configured provider."""
-        if self.ai_provider == "openrouter":
-            from langchain_openai import ChatOpenAI
-
-            return ChatOpenAI(
-                api_key=self.openrouter_api_key,
-                model="xiaomi/mimo-v2-flash:free",
-                base_url="https://openrouter.ai/api/v1",
-            )
-        else:
-            from langchain_groq import ChatGroq
-
-            return ChatGroq(
-                api_key=self.groq_api_key,
-                model="meta-llama/llama-4-scout-17b-16e-instruct",
-                temperature=0,
-                max_tokens=None,
-                timeout=None,
-                max_retries=2,
-            )
+        from langchain_openai import ChatOpenAI
+
+        return ChatOpenAI(
+            api_key=self.openrouter_api_key,
+            model=self.model_openrouter,
+            base_url=self.base_url_openrouter,
+        )
+
 
 
 # Regex pattern to extract page numbers from blockquotes
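Note (not part of the diff): a minimal sketch of how the two new cached properties are consumed downstream in this commit. The five new provider fields have no defaults, so pydantic-settings must now find them in the environment or a .env file (field names are matched case-insensitively, e.g. MODEL_GROQ -> model_groq).

from app.core.config import settings

# Each client is constructed lazily on first attribute access, then cached
# by @cached_property for the lifetime of the process.
answer_model = settings.answer_llm       # ChatGroq, used by ChatService
rewrite_model = settings.preprocess_llm  # ChatOpenAI via OpenRouter, used by QueryPreprocessor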
app/core/database.py CHANGED
@@ -1,7 +1,7 @@
 """Supabase database operations for RAG chunk storage and vector search."""
 
 import logging
-from typing import Any
+from typing import Any, List, Dict
 
 from supabase import Client, create_client
 
@@ -39,7 +39,7 @@ class Database:
             raise RuntimeError("Database not connected. Call connect() first.")
         return self.admin_client if admin else self.supabase
 
-    async def upsert_chunks(self, chunks: list[dict[str, Any]]) -> int:
+    async def upsert_chunks(self, chunks: List[Dict[str, Any]]) -> int:
         """Upsert document chunks into Supabase.
 
         Args:
@@ -79,9 +79,9 @@ class Database:
 
     async def vector_search(
         self,
-        query_embedding: list[float],
+        query_embedding: List[float],
         top_k: int = 6,
-    ) -> list[dict[str, Any]]:
+    ) -> List[Dict[str, Any]]:
         """Perform vector similarity search using Supabase RPC.
 
         Args:
app/core/prompt_templates.py CHANGED
@@ -1,16 +1,104 @@
-"""
-Prompt templates for the chat.
-"""
-
-from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-
-from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-
-general_prompt = ChatPromptTemplate.from_messages(
+from langchain_core.prompts import (
+    ChatPromptTemplate,
+    MessagesPlaceholder,
+    PromptTemplate,
+)
+
+# ==========================================
+# 1. CLASSIFIER PROMPT (Few-Shot Optimized)
+# ==========================================
+classifier_prompt = PromptTemplate.from_template(
+    """Anda adalah penganalisis soalan peperiksaan Sejarah SPM.
+Tugas anda adalah mengkategorikan soalan kepada "FAKTA" atau "KBAT" (Kemahiran Berfikir Aras Tinggi).
+
+PANDUAN ASAS:
+- FAKTA: Bertanya maklumat dari buku teks (tujuan sejarah asal, kronologi, tarikh, tokoh, peristiwa).
+- KBAT: Bertanya ulasan, pendapat, rasional, atau menghubungkaitkan nilai/pengajaran kepada masyarakat atau negara secara umum.
+
+BERIKUT ADALAH CONTOH RUJUKAN ANDA (FEW-SHOT EXAMPLES):
+
+[CONTOH KATEGORI FAKTA]
+Soalan: Pilihan raya di negara kita bermula dengan pilihan raya majlis perbandaran, pilihan raya negeri dan pilihan raya umum yang diadakan pada tahun 1955. Mengapakah pilihan raya diadakan?
+Analisis: Walaupun bermula dengan petikan panjang, ia bertanya tujuan sejarah (mengapa pilihan raya diadakan pada waktu tersebut).
+Jawapan: FAKTA
+
+Soalan: Jelaskan perkembangan Pilihan Raya Majlis Perbandaran.
+Analisis: Walaupun menggunakan perkataan "Jelaskan", ia diikuti dengan "perkembangan", bermaksud meminta kronologi peristiwa sejarah sebenar.
+Jawapan: FAKTA
+
+Soalan: Bagaimanakah Jawatankuasa Setia Kawan Malaysia (JSKM) dapat menjayakan pembentukan Malaysia?
+Analisis: Bertanya tentang proses dan tindakan spesifik sebuah jawatankuasa sejarah.
+Jawapan: FAKTA
+
+[CONTOH KATEGORI KBAT]
+Soalan: Pembentukan Malaysia penting bagi menjamin kestabilan dan kemakmuran negara. Jelaskan.
+Analisis: "Jelaskan" di sini merujuk kepada konsep umum (kestabilan dan kemakmuran), bukan kronologi peristiwa.
+Jawapan: KBAT
+
+Soalan: Mengapakah undang-undang penting kepada sesebuah negara?
+Analisis: Bertanya soalan universal/umum ("sesebuah negara"), bukan peristiwa spesifik Tanah Melayu.
+Jawapan: KBAT
+
+Soalan: Kemerdekaan negara amat unik kerana ia dicapai melalui meja rundingan tanpa melibatkan pertumpahan darah. Berdasarkan pernyataan tersebut, beri ulasan anda.
+Analisis: Meminta "ulasan" berdasarkan pernyataan nilai sejarah.
+Jawapan: KBAT
+
+TUGAS ANDA:
+Berdasarkan contoh-contoh di atas, kategorikan soalan berikut dan berikan analisis anda.
+
+PENTING: Anda MESTI mengeluarkan jawapan dalam format JSON yang sah dengan dua kunci berikut:
+1. "analisis": (Penerangan ringkas mengapa soalan itu FAKTA atau KBAT)
+2. "kategori": (HANYA perkataan "FAKTA" atau "KBAT")
+
+Soalan: {query}
+Output JSON:"""
+)
+
+# ==========================================
+# 2. QUERY REWRITER PROMPT (The Search Optimizer)
+# ==========================================
+# Purpose: Strip conversational fluff and isolate core historical entities for vector retrieval.
+query_prompt = PromptTemplate.from_template(
+    """Anda adalah pakar carian pangkalan data vektor untuk Sejarah Malaysia.
+Tugas anda adalah menulis semula soalan pengguna supaya menjadi kata kunci carian yang optimum.
+
+GARIS PANDUAN:
+1. Buang perkataan soal (seperti apakah, siapakah, jelaskan, huraikan).
+2. Kekalkan HANYA entiti sejarah utama, nama tokoh, lokasi, atau nama peristiwa.
+3. Tambah sinonim atau terma rasmi yang relevan jika perlu untuk meluaskan carian sejarah.
+
+CONTOH 1:
+Soalan Asal: "Perjanjian Malaysia yang melibatkan British, Persekutuan Tanah Melayu, Sarawak, Sabah dan Singapura telah ditandatangani pada 9 Julai 1963. Jelaskan inti pati Perjanjian Malaysia."
+Carian Optimum: "Perjanjian Malaysia 9 Julai 1963"
+
+CONTOH 2:
+Soalan Asal: "Nyatakan ciri-ciri negara bangsa Kesultanan Melayu Melaka."
+Carian Optimum: "Negara bangsa Kesultanan Melayu Melaka"
+
+CONTOH 3:
+Soalan Asal: "Apakah yang anda faham dengan konsep kedaulatan?"
+Carian Optimum: "Konsep kedaulatan"
+
+CONTOH 4:
+Soalan Asal: "Bagaimanakah Jawatankuasa Setia Kawan Malaysia (JSKM) dapat menjayakan pembentukan Malaysia?"
+Carian Optimum: "Jawatankuasa Setia Kawan Malaysia (JSKM)"
+
+Tulis semula soalan berikut untuk carian pangkalan data. Berikan HANYA kata kunci carian tersebut.
+
+Soalan Asal: {query}
+Carian Optimum:"""
+)
+
+
+# ==========================================
+# 3. ANSWER PROMPT (The Strict Examiner)
+# ==========================================
+# Purpose: Answer the query using ONLY the retrieved context, formatted as a marking scheme.
+answer_prompt = ChatPromptTemplate.from_messages(
     [
         (
             "system",
-            """Anda adalah Guru Pakar Sejarah Malaysia yang sangat tegas. Tugas anda adalah menjawab soalan berdasarkan maklumat tepat daripada teks yang diberikan SAHAJA.
+            """Anda adalah Guru Pakar Sejarah Malaysia dan pemeriksa kertas peperiksaan yang sangat tegas. Tugas anda adalah menjawab soalan berdasarkan maklumat tepat daripada teks yang diberikan SAHAJA.
 
 GARIS PANDUAN PENTING:
 1. EKSTRAK SEMUA FAKTA (KOMPREHENSIF): Baca seluruh blok konteks yang diberikan secara terperinci. Ekstrak dan senaraikan SEMUA isi penting, peristiwa, tokoh, dan tarikh yang relevan dengan soalan. Jangan ringkaskan secara berlebihan atau tinggalkan sebarang maklumat yang berpotensi menjadi markah.
@@ -18,9 +106,50 @@ general_prompt = ChatPromptTemplate.from_messages(
 3. JAWAPAN TIADA DALAM KONTEKS: Jika konteks yang diberikan langsung tidak mengandungi maklumat yang relevan untuk menjawab soalan, anda MESTI membalas dengan ayat ini sahaja: "Maklumat ini tidak terdapat dalam konteks yang diberikan." Jangan sesekali cuba mereka jawapan.
 4. PETIKAN SUMBER (WAJIB):
    - Setiap blok konteks bermula dengan label sumber dalam format [Source: X].
-   - Anda MESTI meletakkan petikan sumber yang TEPAT di hujung setiap isi poin anda.
-   - Contoh: Jika konteks bermula dengan "[Source: Buku_Teks_Sejarah_Tingkatan_5 Page 4]", tulis isi sebagai: "Asas berkerajaan sendiri. (Buku_Teks_Sejarah_Tingkatan_5 Page 4)"
+   - Anda MESTI meletakkan petikan sumber yang TEPAT di paling hujung isi poin anda sekali sahaja.
    - Dilarang meneka nombor muka surat jika tiada dalam label.
+5. FORMAT SKEMA PEMARKAHAN: Susun jawapan anda dalam bentuk "bullet points" yang ringkas, padat, dan mudah dibaca (Bahasa Melayu). Elakkan perenggan yang meleret.
+
+CONTOH FORMAT JAWAPAN (FEW-SHOT EXAMPLES):
+Berikut adalah contoh anda menstrukturkan soalan dan jawapan. Terdapat dua jenis soalan, iaitu soalan tunggal atau soalan yang mempunyai beberapa bahagian (mengandaikan sumber konteks ialah [Source: Teks_Rujukan Page x]):
+
+CONTOH 1:
+Pilihan raya di negara kita bermula dengan pilihan raya majlis perbandaran, pilihan raya negeri dan pilihan raya umum yang diadakan pada tahun 1955. Mengapakah pilihan raya diadakan?
+- Asas berkerajaan sendiri
+- Memilih wakil
+- Membentuk kerajaan
+- Pengamalan sistem demokrasi
+[Source: Teks_Rujukan Page x]
+
+CONTOH 2:
+Jelaskan perkembangan Pilihan Raya Majlis Perbandaran.
+- Pilihan raya Perbandaran George Town, Pulau Pinang/diadakan pada tahun 1951
+- Untuk memilih sembilan orang ahli Pesuruhjaya Perbandaran George Town
+- Ahli dalam Majlis Perbandaran dilantik oleh Gabenor Negeri-negeri Selat
+- Hari pendaftaran pemilih diadakan selama enam minggu/mulai 17 Mei 1951 hingga 30 Jun 1951
+- Seramai 14 514 orang pengundi mendaftar untuk membuang undi
+- Pilihan Raya Majlis Perbandaran Kuala Lumpur yang diadakan pada Februari 1952
+- Merebut 12 kerusi Majlis Perbandaran Kuala Lumpur bagi kawasan Sentul/Bangsar/Imbi/Petaling Jaya
+[Source: Teks_Rujukan Page x]
+
+CONTOH 3:
+1. Tunku Abdul Rahman Putra al-Haj mengumumkan pembentukan Malaysia pada 27 Mei 1961.
+(a) Namakan dua wilayah yang terlibat dalam cadangan pembentukan tersebut. [2 markah]
+- Persekutuan Tanah Melayu
+- Sarawak
+- Sabah
+- Singapura
+- Brunei
+[Source: Teks_Rujukan Page x]
+(b) Bagaimanakah Jawatankuasa Setia Kawan Malaysia (JSKM) dapat menjayakan pembentukan Malaysia? [4 markah]
+- Mengadakan mesyuarat sebanyak empat kali
+- Jesselton/Kuching/ Kuala Lumpur/ Singapura
+- Menyakinkan penduduk Sarawak Sabah tentang pembentukan Malaysia
+- Menyediakan ruang perbincangan bagi mendapatkan kata sepakat terhadap pembentukan Malaysia
+- Mengumpulkan pandangan tentang pembentukan Malaysia
+- Menyebarkan maklumat berkaitan dengan Malaysia
+- Memupuk aktiviti penghebahan/ mempercepat pembentukan Malaysia
+[Source: Teks_Rujukan Page x]
 """,
         ),
         MessagesPlaceholder(variable_name="chat_history"),
@@ -31,7 +160,7 @@ general_prompt = ChatPromptTemplate.from_messages(
 
 Soalan: {query}
 
-Berikan skema jawapan yang KOMPREHENSIF (merangkumi semua isi yang relevan) berdasarkan konteks di atas SAHAJA. Pastikan setiap poin disokong dengan petikan sumber [Source: X] yang betul.""",
+Berikan skema jawapan yang KOMPREHENSIF (merangkumi semua isi yang relevan) berdasarkan konteks di atas SAHAJA. Pastikan letak petikan sumber [Source: X] yang betul di paling hujung isi poin anda sekali sahaja.""",
         ),
     ]
 )
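Note (not part of the diff): a sketch of how the three templates wire into chains; it mirrors the chains built in app/services/query_preprocessor.py and app/services/chat.py later in this commit.

from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

from app.core.config import settings
from app.core.prompt_templates import answer_prompt, classifier_prompt, query_prompt

# classifier_prompt ends with "Output JSON:", so it pairs with JsonOutputParser;
# the other two templates produce plain text.
classify_chain = classifier_prompt | settings.preprocess_llm | JsonOutputParser()
rewrite_chain = query_prompt | settings.preprocess_llm | StrOutputParser()
answer_chain = answer_prompt | settings.answer_llm | StrOutputParser()

result = classify_chain.invoke({"query": "Mengapakah pilihan raya diadakan?"})
print(result["kategori"])  # "FAKTA" or "KBAT"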
app/schemas/rag.py CHANGED
@@ -39,7 +39,7 @@ class AnswerRequest(BaseModel):
 class DebugInfo(BaseModel):
     """Debug information included in the answer response."""
 
-    top_doc_ids: list[str] = Field(
+    top_doc_ids: List[str] = Field(
         default_factory=list,
         description="IDs of the top chunks used as context.",
     )
@@ -57,7 +57,7 @@ class AnswerResponse(BaseModel):
         ...,
         description="Generated answer text based on retrieved context.",
     )
-    citations: list[str] = Field(
+    citations: List[str] = Field(
         default_factory=list,
         description="List of source citations used in the answer.",
     )
app/services/chat.py CHANGED
@@ -2,7 +2,7 @@
 
 import asyncio
 import logging
-from typing import Any, AsyncGenerator
+from typing import Any, AsyncGenerator, List, Dict
 
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import Runnable
@@ -17,13 +17,13 @@ class ChatService:
 
     def __init__(self) -> None:
         """Initialize chat service with the configured LLM."""
-        self.model = settings.llm
-        logger.info(f"Initialized chat service with provider: {settings.ai_provider}")
+        self.model = settings.answer_llm
+        logger.info(f"Initialized chat service with provider: {self.model}")
 
     async def generate_answer(
         self,
         query: str,
-        context_blocks: list[dict[str, Any]],
+        context_blocks: List[Dict[str, Any]],
         prompt: ChatPromptTemplate,
     ) -> str:
         """Generate a RAG answer using retrieved context blocks.
@@ -49,9 +49,9 @@ class ChatService:
             logger.error(f"Error generating answer: {e}")
             return "Sorry, I was unable to generate an answer."
 
-    def _build_context_string(self, context_blocks: list[dict[str, Any]]) -> str:
+    def _build_context_string(self, context_blocks: List[Dict[str, Any]]) -> str:
         """Helper to build context string with citations."""
-        context_parts: list[str] = []
+        context_parts: List[str] = []
         for block in context_blocks:
             chunk_id = block.get("chunk_id", "unknown")
             source = block.get("source", "")
app/services/chunker.py CHANGED
@@ -2,7 +2,7 @@
 
 import logging
 import re
-from typing import Any
+from typing import Any, List, Dict
 
 from langchain_text_splitters import (
     MarkdownHeaderTextSplitter,
@@ -70,7 +70,7 @@ class TextChunker:
             return f"Page {match.group(1)}"
         return None
 
-    def _extract_page_sections(self, text: str) -> list[tuple[str | None, str]]:
+    def _extract_page_sections(self, text: str) -> List[tuple[str | None, str]]:
         """Split text into sections by page markers and extract page numbers.
 
         Args:
@@ -84,7 +84,7 @@ class TextChunker:
         if not page_markers:
             return [(None, text)]
 
-        sections: list[tuple[str | None, str]] = []
+        sections: List[tuple[str | None, str]] = []
         for i, match in enumerate(page_markers):
             page_num = f"Page {match.group(1)}"
             start = match.end()
@@ -101,7 +101,7 @@ class TextChunker:
         """Remove page source markers from text content."""
         return PAGE_SOURCE_PATTERN.sub("", text).strip()
 
-    def _build_page_index(self, text: str) -> list[tuple[int, str]]:
+    def _build_page_index(self, text: str) -> List[tuple[int, str]]:
         """Build an index of (position, page_number) from page markers.
 
         Args:
@@ -120,7 +120,7 @@ class TextChunker:
         self,
         content: str,
         original_text: str,
-        page_index: list[tuple[int, str]],
+        page_index: List[tuple[int, str]],
         fallback: str,
     ) -> str:
         """Find the page number for a chunk by locating its position in original text.
@@ -161,7 +161,7 @@ class TextChunker:
 
     def chunk_text(
         self, text: str, source: str, base_chunk_id: str
-    ) -> list[dict[str, Any]]:
+    ) -> List[Dict[str, Any]]:
         """Split text into overlapping chunks with page number extraction.
 
         Args:
@@ -187,7 +187,7 @@ class TextChunker:
         # Second pass: Split large sections by character limit
         split_docs = self.text_splitter.split_documents(header_docs)
 
-        chunks: list[dict[str, Any]] = []
+        chunks: List[Dict[str, Any]] = []
         chunk_num = 1
         last_page = source  # Track last seen page for continuity
 
@@ -246,7 +246,7 @@ class TextChunker:
 
         return text
 
-    def _attach_header_context(self, metadata: dict[str, Any], content: str) -> str:
+    def _attach_header_context(self, metadata: Dict[str, Any], content: str) -> str:
         """Prefix chunk content with header context from markdown metadata.
 
         Args:
@@ -256,7 +256,7 @@ class TextChunker:
         Returns:
             Content prefixed with header context.
         """
-        header_lines: list[str] = []
+        header_lines: List[str] = []
         for level in range(1, 5):
             key = f"Header {level}"
             if metadata.get(key):
@@ -268,7 +268,7 @@ class TextChunker:
 
         return "\n\n".join(["\n".join(header_lines), content])
 
-    def chunk_documents(self, documents: list[dict[str, str]]) -> list[dict[str, Any]]:
+    def chunk_documents(self, documents: List[Dict[str, str]]) -> List[Dict[str, Any]]:
         """Chunk multiple documents.
 
         Args:
@@ -277,7 +277,7 @@ class TextChunker:
         Returns:
             List of all chunks from all documents.
         """
-        all_chunks: list[dict[str, Any]] = []
+        all_chunks: List[Dict[str, Any]] = []
 
         for doc in documents:
             text = doc.get("text", "")
app/services/embedding.py CHANGED
@@ -2,6 +2,7 @@
 
 import asyncio
 import logging
+from typing import List
 
 from app.core.config import settings
 
@@ -11,20 +12,18 @@ logger = logging.getLogger(__name__)
 class EmbeddingService:
     """Service for generating text embeddings using SentenceTransformers."""
 
-    def __init__(
-        self,
-        model_name: str = "",
-        device: str = "cuda",
-        max_length: int = 8192,
-    ) -> None:
+    def __init__(self):
         import torch
         from sentence_transformers import SentenceTransformer
 
-        _model_name = model_name or settings.model_name
+        _model_name = settings.embedding_model_name
+        _device = settings.device
 
         # Auto-detect device
-        if device == "cuda" and not torch.cuda.is_available():
-            device = "cpu"
+        if torch.cuda.is_available():
+            _device = "cuda"
+        else:
+            _device = "cpu"
             logger.warning("CUDA not available, falling back to CPU")
 
         logger.info(f"Loading embedding model: {_model_name}")
@@ -33,11 +32,9 @@ class EmbeddingService:
         self.model = SentenceTransformer(
             _model_name,
             trust_remote_code=True,
-            device=device,
-            model_kwargs={"torch_dtype": torch.float16} if device == "cuda" else {},
+            device=_device,
+            model_kwargs={"torch_dtype": torch.float16} if _device == "cuda" else {},
         )
-        self.model.max_seq_length = max_length
-        self.device = device
 
         self.embedding_dim = self.model.get_sentence_embedding_dimension()
         logger.info(
@@ -45,8 +42,8 @@ class EmbeddingService:
         )
 
     async def embed_texts(
-        self, texts: list[str], batch_size: int = 8
-    ) -> list[list[float]]:
+        self, texts: List[str], batch_size: int = 8
+    ) -> List[List[float]]:
         """Generate embeddings for a list of document texts.
 
         Args:
@@ -78,7 +75,7 @@ class EmbeddingService:
             logger.error(f"Failed to generate embeddings: {e}")
             raise
 
-    async def embed_query(self, query: str) -> list[float]:
+    async def embed_query(self, query: str) -> List[float]:
         """Generate embedding for a single query.
 
         Args:
app/services/ocr.py ADDED
@@ -0,0 +1,66 @@
+"""OCR service to extract text from image."""
+
+from PIL import Image
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class OCRService:
+    """OCR service to extract text from image."""
+
+    def __init__(self):
+        import torch
+        from transformers import AutoProcessor, AutoModelForImageTextToText
+
+        _model_name = "zai-org/GLM-OCR"
+        _device = "cuda"
+
+        # Auto-detect device
+        if torch.cuda.is_available():
+            _device = "cuda"
+        else:
+            _device = "cpu"
+            logger.warning("CUDA not available, falling back to CPU")
+
+        logger.info(f"Loading OCR model: {_model_name}")
+
+        self.processor = AutoProcessor.from_pretrained(_model_name)
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            pretrained_model_name_or_path=_model_name,
+            torch_dtype="auto",
+            device_map="auto",
+        )
+
+    async def extract_text(self, pil_image: Image.Image) -> str:
+        """Extract text from image."""
+
+        img = pil_image.convert("RGB")
+
+        message = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": img},
+                    {"type": "text", "text": "Text Recognition:"},
+                ],
+            }
+        ]
+
+        inputs = self.processor.apply_chat_template(
+            message,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(self.model.device)
+
+        inputs.pop("token_type_ids", None)
+        generated_ids = self.model.generate(**inputs, max_new_tokens=8192)
+        output_text = self.processor.decode(
+            generated_ids[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        )
+        return output_text
+
+
+ocr_service = OCRService()
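Note (not part of the diff): a usage sketch for the new service. Because ocr_service is instantiated at module import time, the GLM-OCR weights load once per process; the image file name below is hypothetical.

import asyncio

from PIL import Image

from app.services.ocr import ocr_service  # model weights load on this import


async def main() -> None:
    image = Image.open("exam_page.png")  # hypothetical input image
    text = await ocr_service.extract_text(image)
    print(text)


asyncio.run(main())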
app/services/query_preprocessor.py ADDED
@@ -0,0 +1,49 @@
+"""Query preprocessing: classification and query rewriting."""
+
+import asyncio
+import logging
+
+from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
+
+from app.core.config import settings
+from app.core.prompt_templates import classifier_prompt, query_prompt
+
+logger = logging.getLogger(__name__)
+
+
+class QueryPreprocessor:
+    """Handles query classification and rewriting as two separate chains."""
+
+    def __init__(self) -> None:
+        self.llm = settings.preprocess_llm
+        # Chain 1: Classify → returns "FAKTA" or "KBAT"
+        self.classifier_chain = classifier_prompt | self.llm | JsonOutputParser()
+        # Chain 2: Rewrite → returns optimized search query
+        self.rewriter_chain = query_prompt | self.llm | StrOutputParser()
+
+    async def classify(self, query: str) -> tuple[str, bool]:
+        """Classify a query as FAKTA or KBAT.
+
+        Returns:
+            A tuple of (classification, needs_rag).
+            - classification: "FAKTA" or "KBAT"
+            - needs_rag: True if FAKTA (proceed with RAG), False if KBAT (stop).
+        """
+        result = await asyncio.to_thread(self.classifier_chain.invoke, {"query": query})
+        classification = result["kategori"].strip().upper()
+        needs_rag = classification == "FAKTA"
+        logger.info(f"Query classified as: {classification} | needs_rag: {needs_rag} | Analysis: {result['analisis']}")
+        return classification, needs_rag
+
+    async def rewrite(self, query: str) -> str:
+        """Rewrite a query to optimize for vector search.
+
+        Returns:
+            Optimized search query string.
+        """
+        result = await asyncio.to_thread(self.rewriter_chain.invoke, {"query": query})
+        logger.info(f"Rewritten query: {result.strip()}")
+        return result.strip()
+
+
+query_preprocessor = QueryPreprocessor()
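Note (not part of the diff): a sketch of the two-step gate this service provides, mirroring how RAGService.answer_question calls it below: classify first, and only rewrite (and retrieve) when the question is FAKTA.

import asyncio

from app.services.query_preprocessor import query_preprocessor


async def main() -> None:
    question = "Jelaskan perkembangan Pilihan Raya Majlis Perbandaran."
    category, needs_rag = await query_preprocessor.classify(question)
    if needs_rag:  # only FAKTA questions continue to retrieval
        search_query = await query_preprocessor.rewrite(question)
        print(category, "->", search_query)


asyncio.run(main())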
app/services/rag.py CHANGED
@@ -5,14 +5,19 @@ Orchestrates the complete RAG pipeline: chunk → embed → search → generate.
 
 import asyncio
 import logging
+import re
 import time
-from typing import Any, AsyncGenerator
+from typing import Any, AsyncGenerator, Optional, List, Dict
+from fastapi import UploadFile
+from PIL import Image
 
 from app.core.database import db
-from app.core.prompt_templates import general_prompt
+from app.core.prompt_templates import answer_prompt
 from app.services.chat import chat_service
 from app.services.chunker import chunker
 from app.services.embedding import embedding_service
+from app.services.ocr import ocr_service
+from app.services.query_preprocessor import query_preprocessor
 
 logger = logging.getLogger(__name__)
 
@@ -84,7 +89,12 @@ class RAGService:
             logger.error(f"Seeding failed: {e}")
             raise
 
-    async def answer_question(self, query: str, top_k: int = 6) -> dict[str, Any]:
+    async def answer_question(
+        self,
+        query: Optional[str] = "",
+        top_k: int = 6,
+        image: Optional[UploadFile] = None,
+    ) -> Dict[str, Any]:
         """Process a query through the complete RAG pipeline.
 
         Args:
@@ -97,8 +107,31 @@ class RAGService:
         start_time = time.time()
 
         try:
+            if image:
+                pil_image = Image.open(image.file)
+                content = await ocr_service.extract_text(pil_image)
+                query = content
+                logger.info(f"Extracted text from image: {content}")
+
+            query_classification, needs_rag = await query_preprocessor.classify(query)
+
+            if not needs_rag:
+                return {
+                    "text": (
+                        "Soalan ini adalah KBAT. Saya TIDAK dapat menjawab soalan KBAT. "
+                        "Sila tanya soalan FAKTA."
+                    ),
+                    "citations": [],
+                    "debug": {
+                        "top_doc_ids": [],
+                        "latency_ms": int((time.time() - start_time) * 1000),
+                    },
+                }
+
+            rewritten_query = await query_preprocessor.rewrite(query)
+
             # Step 1: Generate query embedding
-            query_embedding = await self.embedding_service.embed_query(query)
+            query_embedding = await self.embedding_service.embed_query(rewritten_query)
 
             # Step 2: Vector similarity search
             search_results = await self.db.vector_search(query_embedding, top_k)
@@ -118,11 +151,11 @@ class RAGService:
 
             # Step 3: Deduplicate and prepare context
            context_blocks = self._prepare_context(search_results)
-            logger.debug(f"Context blocks: {context_blocks}")
+            logger.info(f"Context blocks: {context_blocks}")
 
             # Step 4: Generate answer
             answer_text = await self.chat_service.generate_answer(
-                query, context_blocks, general_prompt
+                query, context_blocks, answer_prompt
             )
             logger.debug(f"Answer text: {answer_text}")
 
@@ -157,8 +190,8 @@ class RAGService:
             }
 
     def _prepare_context(
-        self, search_results: list[dict[str, Any]]
-    ) -> list[dict[str, Any]]:
+        self, search_results: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """Prepare context blocks from search results with deduplication.
 
         Args:
@@ -168,7 +201,7 @@ class RAGService:
             Processed, deduplicated context blocks.
         """
         seen_prefixes: set[str] = set()
-        context_blocks: list[dict[str, Any]] = []
+        context_blocks: List[Dict[str, Any]] = []
 
         for result in search_results:
             chunk_id = result.get("chunk_id", "")
@@ -190,18 +223,33 @@ class RAGService:
     def _extract_citations(
         self,
         answer_text: str,
-        context_blocks: list[dict[str, Any]],
-    ) -> list[str]:
-        """Return citations from the context blocks that were used.
+        context_blocks: List[Dict[str, Any]],
+    ) -> List[str]:
+        """Return only citations that the LLM actually referenced in its answer.
+
+        Parses [Source: X] tags from the answer text, then matches them
+        against the available context blocks. Only citations that appear
+        in the answer are returned.
 
         Args:
-            answer_text: Generated answer text (kept for future use).
-            context_blocks: Context blocks that were provided.
+            answer_text: Generated answer text containing [Source: ...] tags.
+            context_blocks: Context blocks that were provided to the LLM.
 
         Returns:
-            List of formatted citations combining base chunk_id and source.
+            List of formatted citations that were actually referenced.
         """
-        citations: list[str] = []
+        # Step 1: Extract all [Source: ...] references from the answer text
+        cited_sources_raw = re.findall(r"\[Source:\s*(.+?)\]", answer_text)
+
+        if not cited_sources_raw:
+            logger.debug("No [Source: ...] tags found in answer text")
+            return []
+
+        # Normalize cited sources for fuzzy matching
+        cited_sources_normalized = {s.strip().lower() for s in cited_sources_raw}
+
+        # Step 2: Build available citations from context blocks
+        citations = []
         for block in context_blocks:
            chunk_id = block.get("chunk_id", "")
            source = block.get("source", "")
@@ -209,10 +257,18 @@ class RAGService:
 
             # Extract base chunk_id (remove the #N suffix)
             base_id = chunk_id.split("#")[0] if "#" in chunk_id else chunk_id
 
-            # Combine base_id and source
+            # This matches the format used in chat.py _build_context_string()
             citation = f"{base_id} {source}".strip()
 
-            if citation and citation not in citations:
+            if not citation or citation in citations:
+                continue
+
+            # Check if this citation was actually referenced in the answer
+            citation_lower = citation.lower()
+            if any(
+                citation_lower in cs or cs in citation_lower
+                for cs in cited_sources_normalized
+            ):
                 citations.append(citation)
 
         return citations
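Note (not part of the diff): a standalone sketch of the new filtering logic in _extract_citations, using hypothetical answer text and sources; only sources the model actually tagged with [Source: ...] survive the case-insensitive substring match.

import re

answer_text = "- Asas berkerajaan sendiri\n[Source: Teks_Rujukan Page 4]"
cited = {s.strip().lower() for s in re.findall(r"\[Source:\s*(.+?)\]", answer_text)}

available = ["Teks_Rujukan Page 4", "Teks_Rujukan Page 9"]  # from context blocks
kept = [
    c for c in available
    if any(c.lower() in s or s in c.lower() for s in cited)
]
print(kept)  # ['Teks_Rujukan Page 4']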
main.py CHANGED
@@ -5,9 +5,10 @@ import logging
 import time
 from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
+from typing import Optional
 
 import uvicorn
-from fastapi import FastAPI, HTTPException, UploadFile
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 
@@ -156,7 +157,13 @@ async def seed_documents(file: UploadFile) -> SeedResponse:
         500: {"model": ErrorResponse, "description": "Processing failed"},
     },
 )
-async def answer_question(request: AnswerRequest) -> AnswerResponse:
+async def answer_question(
+    query: Optional[str] = Form(default="", description="The question to answer."),
+    top_k: int = Form(default=6, description="Number of top-k chunks to retrieve."),
+    image: Optional[UploadFile] = File(
+        default=None, description="Optional image upload."
+    ),
+) -> AnswerResponse:
     """Answer a question using Retrieval-Augmented Generation.
 
     Pipeline:
@@ -166,10 +173,10 @@ async def answer_question(request: AnswerRequest) -> AnswerResponse:
     4. Return answer with citations and debug info
     """
     try:
-        logger.info(f"Processing query: '{request.query[:100]}...'")
+        logger.info(f"Processing query: '{query[:100] if query else 'No query'}...'")
 
         result = await rag_service.answer_question(
-            query=request.query, top_k=request.top_k
+            query=query, top_k=top_k, image=image
         )
 
         response = AnswerResponse(
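Note (not part of the diff): a hypothetical client call against the updated endpoint, which now takes multipart form data instead of a JSON AnswerRequest body; the route path and port are assumptions.

import httpx

with open("question.png", "rb") as f:  # hypothetical photo of an exam question
    resp = httpx.post(
        "http://localhost:8000/answer",  # assumed route path and port
        data={"top_k": "6"},  # query may be omitted when an image is supplied
        files={"image": ("question.png", f, "image/png")},
        timeout=120.0,
    )
print(resp.json()["text"])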
pyproject.toml CHANGED
@@ -1,66 +1,32 @@
 [project]
-name = "sejarah-rag"
+name = "ocr"
 version = "0.1.0"
-description = "RAG backend for Malaysian History (Sejarah) education"
+description = "Add your description here"
 readme = "README.md"
-requires-python = ">=3.12.11"
+requires-python = ">=3.13"
 dependencies = [
-    "fastapi>=0.115.0",
-    "uvicorn[standard]>=0.34.0",
-    "pydantic>=2.10.0",
-    "pydantic-settings>=2.7.0",
-    "langchain-core>=1.2.7",
-    "langchain-groq>=1.1.1",
-    "langchain-openai>=1.1.7",
-    "langchain-text-splitters>=1.1.0",
-    "llama-cpp-python>=0.3.4",
-    "sentence-transformers>=3.4.0,<4",
-    "supabase>=2.27.2",
-    "transformers>=4.44.0,<4.47",
+    "transformers",
+    "torch",
+    "torchvision",
+    "accelerate>=1.12.0",
+    "pydantic-settings>=2.13.1",
+    "uvicorn>=0.41.0",
+    "fastapi>=0.135.0",
+    "supabase>=2.28.0",
+    "langchain-core>=1.2.16",
+    "langchain-groq>=1.1.2",
+    "langchain-text-splitters>=1.1.1",
+    "sentence-transformers>=5.2.3",
     "python-multipart>=0.0.22",
-    "torch>=2.5.1",
-    "torchvision>=0.20.1",
-    "torchaudio>=2.5.1",
+    "langchain-openai>=1.1.10",
 ]
 
-[dependency-groups]
-dev = [
-    "pytest>=8.0.0",
-    "pytest-asyncio>=0.24.0",
-    "httpx>=0.27.0",
-    "ruff>=0.9.0",
-    "ipykernel>=7.1.0",
-]
-
-[tool.uv]
-required-environments = ["sys_platform == 'win32'", "sys_platform == 'linux'"]
-
-[[tool.uv.index]]
-name = "llama-cpp-python-cuda"
-url = "https://abetlen.github.io/llama-cpp-python/whl/cu121"
-explicit = true
-
 [[tool.uv.index]]
-name = "pytorch"
-url = "https://download.pytorch.org/whl/cu121"
+name = "cu126"
+url = "https://download.pytorch.org/whl/cu126"
 explicit = true
 
 [tool.uv.sources]
-llama-cpp-python = { index = "llama-cpp-python-cuda" }
-torch = { index = "pytorch" }
-torchvision = { index = "pytorch" }
-torchaudio = { index = "pytorch" }
-
-[tool.ruff]
-line-length = 100
-target-version = "py312"
-
-[tool.ruff.lint]
-select = ["E", "F", "I", "N", "W", "UP", "B", "SIM"]
-
-[tool.ruff.lint.isort]
-known-first-party = ["app"]
-
-[tool.pytest.ini_options]
-testpaths = ["tests"]
-asyncio_mode = "auto"
+transformers = { git = "https://github.com/huggingface/transformers.git" }
+torch = { index = "cu126" }
+torchvision = { index = "cu126" }
uv.lock CHANGED
The diff for this file is too large to render.