Kalpokoch committed on
Commit 61e6651 · 1 Parent(s): 9a000fe

fixes to pytesseract

Files changed (1)
  1. app.py +37 -34
app.py CHANGED
@@ -16,32 +16,35 @@ import fitz
from PIL import Image
import pytesseract
from sentence_transformers import SentenceTransformer
- from ctransformers import AutoModel  # NEW: For running quantized GGUF models

# --- 1. INITIAL SETUP & MODEL LOADING ---

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

- app = FastAPI(title="Generative Universal Data AI", version="3.0.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)

- # --- Load Models ---
try:
-     logger.info("Loading AI models...")
-     # Model for creating vector embeddings (remains the same)
-     embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')

-     # NEW: Loading the quantized Phi-2 model using ctransformers
-     # This downloads a GGUF model file, optimized for CPU inference.
-     # Q4_K_M is a good balance of quality and performance.
    llm = AutoModel.from_pretrained(
-         "TheBloke/phi-2-GGUF",
-         model_file="phi-2.Q4_K_M.gguf"
    )
    logger.info("AI models loaded successfully.")
except Exception as e:
@@ -54,44 +57,48 @@ SESSION_DATA = {}
# --- 2. DATA MODELS ---
class QueryRequest(BaseModel): question: str
class UploadResponse(BaseModel): session_id: str; filename: str; chunks_created: int
- # Modified response to reflect generative model output
class QueryResponse(BaseModel): answer: str; context: str

- # --- 3. HELPER FUNCTIONS --- (No changes here)
def parse_pdf(content: bytes) -> str:
    doc = fitz.open(stream=content, filetype="pdf"); return "".join(page.get_text() for page in doc)
def parse_image(content: bytes) -> str:
    image = Image.open(io.BytesIO(content)); return pytesseract.image_to_string(image)

# --- 4. API ENDPOINTS ---

@app.get("/")
- def read_root(): return {"status": "ok", "message": "Welcome to the Generative Universal Data AI"}

@app.post("/upload", response_model=UploadResponse)
async def upload_file(file: UploadFile = File(...)):
-     # This endpoint remains largely the same, using the BGE model and semantic chunking
    if not embedding_model: raise HTTPException(status_code=503, detail="Embedding model not available.")
-     # ... (the rest of the upload logic is identical to the previous version)
    session_id = str(uuid.uuid4())
    content = await file.read()
    content_type = file.content_type
    if content_type == "application/pdf": text = parse_pdf(content)
    elif content_type and content_type.startswith("image/"): text = parse_image(content)
-     else: text = content.decode("utf-8")
    if not text.strip(): raise HTTPException(status_code=400, detail="No text could be extracted.")
    text_chunks = semantic_chunker(text, embedding_model)
    if not text_chunks: raise HTTPException(status_code=400, detail="Document too short to be processed.")
    embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True)
    serialized_index = create_faiss_index(embeddings)
    if not serialized_index: raise HTTPException(status_code=500, detail="Failed to create document index.")
    SESSION_DATA[session_id] = {"chunks": text_chunks, "index": serialized_index}
    logger.info(f"Session {session_id} created with {len(text_chunks)} chunks.")
    return {"session_id": session_id, "filename": file.filename, "chunks_created": len(text_chunks)}

@app.post("/query/{session_id}", response_model=QueryResponse)
async def query_session(session_id: str, request: QueryRequest):
-     # --- THIS ENDPOINT IS COMPLETELY REWORKED FOR PHI-2 ---
    if not llm or not embedding_model:
        raise HTTPException(status_code=503, detail="AI models are not available.")

@@ -99,37 +106,33 @@ async def query_session(session_id: str, request: QueryRequest):
    if not session:
        raise HTTPException(status_code=404, detail="Session not found.")

-     # Step 1: Retrieve relevant context (same as before)
    query_with_prefix = f"Represent this sentence for searching relevant passages: {request.question}"
    question_embedding = embedding_model.encode([query_with_prefix], convert_to_numpy=True).astype('float32')
    index = deserialize_faiss_index(session["index"])
    if not index: raise HTTPException(status_code=500, detail="Could not load session index.")
    k = min(5, index.ntotal)
    distances, indices = index.search(question_embedding, k)
    context = "\n".join([session["chunks"][i] for i in indices[0]])

-     # Step 2: Create a specific prompt for the generative model
-     # This template instructs the model on how to behave.
-     prompt = f"""
- Instruct: Use the following context to answer the question accurately. If the answer is not present in the context, say "The answer is not available in the provided document."
-
Context:
{context}

- Question: {request.question}
-
- Answer:"""

-     logger.info("Generating answer with Phi-2...")

-     # Step 3: Generate the answer
    answer = llm(
        prompt,
-         max_new_tokens=256,  # Max length of the answer
-         temperature=0.2,  # Lower temperature for more factual answers
-         stop=["\n", "Instruct:", "Question:"]  # Stop generation at these tokens
    )

-     # Generative models don't give a confidence 'score' like extractive ones.
-     # We simply return the generated text.
    return {"answer": answer.strip(), "context": context}
 
from PIL import Image
import pytesseract
from sentence_transformers import SentenceTransformer
+ from ctransformers import AutoModel
+
+ # --- THIS IS THE FIX FOR TESSERACT ---
+ # Explicitly tell pytesseract where to find the Tesseract OCR engine.
+ pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
+ # ------------------------------------

# --- 1. INITIAL SETUP & MODEL LOADING ---

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+ app = FastAPI(title="Optimized Universal Data AI", version="3.1.0")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)

+ # --- Load Optimized Models ---
try:
+     logger.info("Loading optimized AI models...")
+     # Using a smaller, but still powerful, BGE model
+     embedding_model = SentenceTransformer('BAAI/bge-base-en-v1.5')

+     # Using TinyLlama, which is fast and efficient for CPU
    llm = AutoModel.from_pretrained(
+         "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+         model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
    )
    logger.info("AI models loaded successfully.")
except Exception as e:
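The hard-coded r'/usr/bin/tesseract' path works when the binary sits in the default Debian/Ubuntu location. A minimal sketch of a more portable variant, locating the binary with shutil.which and failing fast if OCR cannot work; the fallback path and the error message are illustrative assumptions, not part of this commit:

import shutil
import pytesseract

# Prefer whatever "tesseract" is on PATH; fall back to the usual package location.
tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
pytesseract.pytesseract.tesseract_cmd = tesseract_path

try:
    # Raises TesseractNotFoundError if the binary is missing or not executable.
    pytesseract.get_tesseract_version()
except pytesseract.TesseractNotFoundError as exc:
    raise RuntimeError(
        f"Tesseract not found at {tesseract_path}; install the tesseract-ocr package."
    ) from exc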
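For reference, ctransformers also exposes AutoModelForCausalLM, which is the entry point its documentation usually shows for GGUF models, together with an explicit model_type. A minimal sketch of that loading pattern for the same TinyLlama file; treat the class name, model_type value, and context_length as assumptions to verify against the installed ctransformers version:

from ctransformers import AutoModelForCausalLM

# Downloads the quantized GGUF file from the Hub and runs it on CPU.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    model_type="llama",       # TinyLlama uses the Llama architecture
    context_length=2048,      # keep prompt + answer within the model's window
)

print(llm("Q: What does OCR stand for?\nA:", max_new_tokens=32))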
 
# --- 2. DATA MODELS ---
class QueryRequest(BaseModel): question: str
class UploadResponse(BaseModel): session_id: str; filename: str; chunks_created: int
class QueryResponse(BaseModel): answer: str; context: str

+ # --- 3. HELPER FUNCTIONS ---
def parse_pdf(content: bytes) -> str:
    doc = fitz.open(stream=content, filetype="pdf"); return "".join(page.get_text() for page in doc)
+
def parse_image(content: bytes) -> str:
    image = Image.open(io.BytesIO(content)); return pytesseract.image_to_string(image)

# --- 4. API ENDPOINTS ---

@app.get("/")
+ def read_root(): return {"status": "ok", "message": "Welcome to the Optimized Universal Data AI"}

@app.post("/upload", response_model=UploadResponse)
async def upload_file(file: UploadFile = File(...)):
    if not embedding_model: raise HTTPException(status_code=503, detail="Embedding model not available.")
+
    session_id = str(uuid.uuid4())
    content = await file.read()
    content_type = file.content_type
+
    if content_type == "application/pdf": text = parse_pdf(content)
    elif content_type and content_type.startswith("image/"): text = parse_image(content)
+     elif file.filename.endswith(('.txt', '.md')): text = content.decode("utf-8")
+     else: raise HTTPException(status_code=400, detail=f"Unsupported file type: {content_type}")
+
    if not text.strip(): raise HTTPException(status_code=400, detail="No text could be extracted.")
+
    text_chunks = semantic_chunker(text, embedding_model)
    if not text_chunks: raise HTTPException(status_code=400, detail="Document too short to be processed.")
+
    embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True)
    serialized_index = create_faiss_index(embeddings)
    if not serialized_index: raise HTTPException(status_code=500, detail="Failed to create document index.")
+
    SESSION_DATA[session_id] = {"chunks": text_chunks, "index": serialized_index}
    logger.info(f"Session {session_id} created with {len(text_chunks)} chunks.")
    return {"session_id": session_id, "filename": file.filename, "chunks_created": len(text_chunks)}
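create_faiss_index and deserialize_faiss_index are defined outside the hunks shown here. A hypothetical sketch of what such helpers could look like on top of faiss-cpu; the names, signatures, and error handling are assumptions inferred from how they are called above:

import faiss
import numpy as np
from typing import Optional

def create_faiss_index(embeddings: np.ndarray) -> Optional[bytes]:
    """Build a flat L2 index over the chunk embeddings and return it in a storable form."""
    try:
        vectors = np.asarray(embeddings, dtype="float32")
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)
        return faiss.serialize_index(index).tobytes()
    except Exception:
        return None

def deserialize_faiss_index(blob: bytes):
    """Restore a FAISS index previously produced by create_faiss_index."""
    try:
        return faiss.deserialize_index(np.frombuffer(blob, dtype="uint8"))
    except Exception:
        return None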
 
@app.post("/query/{session_id}", response_model=QueryResponse)
async def query_session(session_id: str, request: QueryRequest):
    if not llm or not embedding_model:
        raise HTTPException(status_code=503, detail="AI models are not available.")

    if not session:
        raise HTTPException(status_code=404, detail="Session not found.")

    query_with_prefix = f"Represent this sentence for searching relevant passages: {request.question}"
    question_embedding = embedding_model.encode([query_with_prefix], convert_to_numpy=True).astype('float32')
+
    index = deserialize_faiss_index(session["index"])
    if not index: raise HTTPException(status_code=500, detail="Could not load session index.")
+
    k = min(5, index.ntotal)
    distances, indices = index.search(question_embedding, k)
    context = "\n".join([session["chunks"][i] for i in indices[0]])

+     # Correct prompt format for TinyLlama Chat
+     prompt = f"""<|im_start|>user
+ Use the following context to answer the question.
Context:
{context}

+ Question: {request.question}<|im_end|>
+ <|im_start|>assistant
+ """

+     logger.info("Generating answer with TinyLlama...")

    answer = llm(
        prompt,
+         max_new_tokens=256,
+         temperature=0.3,
+         stop=["<|im_end|>"]
    )

    return {"answer": answer.strip(), "context": context}