Kalpokoch committed
Commit 9a000fe · 1 Parent(s): 01eba2b

updated app.py, included phi2

Files changed (5)
  1. app.py +76 -122
  2. core/__init__.py +0 -0
  3. core/chunking.py +46 -0
  4. core/vector_store.py +38 -0
  5. requirements.txt +7 -4
app.py CHANGED
@@ -1,181 +1,135 @@
- #
- # ---------------- Universal Data AI ----------------
- #
- # Final app.py script (v3) with robust FAISS I/O
- # Corrects previous serialization errors.
- #
- # Last updated: August 8, 2025
- #

  import logging
  import uuid
- import io  # Ensure io is imported
-
- # FastAPI & Pydantic
  from fastapi import FastAPI, UploadFile, File, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel

- # Parsing Libraries
- import fitz  # PyMuPDF
  from PIL import Image
  import pytesseract
-
- # AI & Search Libraries
- import numpy as np
- import faiss
  from sentence_transformers import SentenceTransformer
- from transformers import pipeline

  # --- 1. INITIAL SETUP & MODEL LOADING ---

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- app = FastAPI(
-     title="Universal Data AI",
-     description="Ephemeral data analysis tool with in-memory vector search.",
-     version="1.0.1",  # Version bump
- )

  app.add_middleware(
      CORSMiddleware,
-     allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
  )

  try:
      logger.info("Loading AI models...")
-     embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-     qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
      logger.info("AI models loaded successfully.")
  except Exception as e:
      logger.critical(f"Fatal error: Could not load AI models. {e}")
      embedding_model = None
-     qa_pipeline = None

  SESSION_DATA = {}

  # --- 2. DATA MODELS ---

- class QueryRequest(BaseModel):
-     question: str
-
- class UploadResponse(BaseModel):
-     session_id: str
-     filename: str
-     chunks_created: int
-
- class QueryResponse(BaseModel):
-     answer: str
-     score: float
-     context: str
-
- # --- 3. HELPER FUNCTIONS ---
-
  def parse_pdf(content: bytes) -> str:
-     doc = fitz.open(stream=content, filetype="pdf")
-     return "".join(page.get_text() for page in doc)
-
  def parse_image(content: bytes) -> str:
-     image = Image.open(io.BytesIO(content))
-     return pytesseract.image_to_string(image)
-
- def chunk_text(text: str, chunk_size: int = 256, overlap: int = 32) -> list[str]:
-     words = text.split()
-     if not words: return []
-     return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size - overlap)]
-
- # --- THIS FUNCTION IS CORRECTED ---
- def deserialize_index(serialized_index: bytes) -> faiss.Index:
-     """
-     Loads a FAISS index from its byte representation using a robust method.
-     """
-     try:
-         bio = io.BytesIO(serialized_index)
-         # Use PyCallbackIOReader to read from the in-memory binary stream
-         reader = faiss.PyCallbackIOReader(bio.read)
-         return faiss.read_index(reader)
-     except Exception as e:
-         logger.error(f"Failed to deserialize FAISS index: {e}")
-         raise

  # --- 4. API ENDPOINTS ---

  @app.get("/")
- def read_root():
-     return {"status": "ok", "message": "Welcome to Universal Data AI"}

  @app.post("/upload", response_model=UploadResponse)
  async def upload_file(file: UploadFile = File(...)):
-     if not embedding_model:
-         raise HTTPException(status_code=503, detail="AI models are not available.")
-
      session_id = str(uuid.uuid4())
-     logger.info(f"Upload received for session {session_id}: {file.filename}")
      content = await file.read()
-
      content_type = file.content_type
      if content_type == "application/pdf": text = parse_pdf(content)
      elif content_type and content_type.startswith("image/"): text = parse_image(content)
-     elif content_type == "text/plain": text = content.decode("utf-8")
-     else: raise HTTPException(status_code=400, detail=f"Unsupported file type: {content_type}")
-
-     if not text.strip():
-         raise HTTPException(status_code=400, detail="No text could be extracted from the file.")
-
-     text_chunks = chunk_text(text)
-     if not text_chunks:
-         raise HTTPException(status_code=400, detail="Document too short to be processed.")
-
-     embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True).astype('float32')
-     index = faiss.IndexFlatL2(embeddings.shape[1])
-     index.add(embeddings)
-
-     # --- THIS SECTION IS CORRECTED ---
-     try:
-         # Use PyCallbackIOWriter to write the index to an in-memory binary stream
-         bio = io.BytesIO()
-         writer = faiss.PyCallbackIOWriter(bio.write)
-         faiss.write_index(index, writer)
-         serialized_index = bio.getvalue()
-     except Exception as e:
-         logger.error(f"Failed to serialize FAISS index: {e}")
-         raise HTTPException(status_code=500, detail="Failed to create document index.")
-
-     SESSION_DATA[session_id] = {
-         "chunks": text_chunks,
-         "index": serialized_index,  # Store the index as bytes
-     }
-
      logger.info(f"Session {session_id} created with {len(text_chunks)} chunks.")
      return {"session_id": session_id, "filename": file.filename, "chunks_created": len(text_chunks)}

  @app.post("/query/{session_id}", response_model=QueryResponse)
  async def query_session(session_id: str, request: QueryRequest):
-     if not qa_pipeline or not embedding_model:
          raise HTTPException(status_code=503, detail="AI models are not available.")
-
      session = SESSION_DATA.get(session_id)
      if not session:
          raise HTTPException(status_code=404, detail="Session not found.")

-     index = deserialize_index(session["index"])
-     question_embedding = embedding_model.encode([request.question]).astype('float32')
-
-     k = min(3, index.ntotal)
      distances, indices = index.search(question_embedding, k)
-
-     relevant_chunks = [session["chunks"][i] for i in indices[0]]
-     context = " ".join(relevant_chunks)

-     result = qa_pipeline(question=request.question, context=context)

-     logger.info(f"Query for session {session_id} answered with score: {result['score']:.4f}")
-     return {
-         "answer": result["answer"],
-         "score": result["score"],
-         "context": context,
-     }
+ # app.py

  import logging
  import uuid
+ import io
  from fastapi import FastAPI, UploadFile, File, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel

+ # Import from our core modules
+ from core.chunking import semantic_chunker
+ from core.vector_store import create_faiss_index, deserialize_faiss_index
+
+ # Parsing and AI libraries
+ import fitz
  from PIL import Image
  import pytesseract
  from sentence_transformers import SentenceTransformer
+ from ctransformers import AutoModelForCausalLM  # NEW: for running quantized GGUF models

  # --- 1. INITIAL SETUP & MODEL LOADING ---

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ app = FastAPI(title="Generative Universal Data AI", version="3.0.0")

  app.add_middleware(
      CORSMiddleware,
+     allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
  )

+ # --- Load Models ---
  try:
      logger.info("Loading AI models...")
+     # Model for creating vector embeddings (now BAAI/bge-large-en-v1.5)
+     embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
+
+     # NEW: Load the quantized Phi-2 model using ctransformers.
+     # This downloads a GGUF model file, optimized for CPU inference;
+     # Q4_K_M is a good balance of quality and performance.
+     llm = AutoModelForCausalLM.from_pretrained(
+         "TheBloke/phi-2-GGUF",
+         model_file="phi-2.Q4_K_M.gguf"
+     )
      logger.info("AI models loaded successfully.")
  except Exception as e:
      logger.critical(f"Fatal error: Could not load AI models. {e}")
      embedding_model = None
+     llm = None

  SESSION_DATA = {}

  # --- 2. DATA MODELS ---
+ class QueryRequest(BaseModel): question: str
+ class UploadResponse(BaseModel): session_id: str; filename: str; chunks_created: int
+ # Modified response to reflect generative model output
+ class QueryResponse(BaseModel): answer: str; context: str

+ # --- 3. HELPER FUNCTIONS --- (no changes here)
  def parse_pdf(content: bytes) -> str:
+     doc = fitz.open(stream=content, filetype="pdf"); return "".join(page.get_text() for page in doc)
  def parse_image(content: bytes) -> str:
+     image = Image.open(io.BytesIO(content)); return pytesseract.image_to_string(image)

  # --- 4. API ENDPOINTS ---

  @app.get("/")
+ def read_root(): return {"status": "ok", "message": "Welcome to the Generative Universal Data AI"}

  @app.post("/upload", response_model=UploadResponse)
  async def upload_file(file: UploadFile = File(...)):
+     # This endpoint remains largely the same, using the BGE model and semantic chunking
+     if not embedding_model: raise HTTPException(status_code=503, detail="Embedding model not available.")
+     # ... (the rest of the upload logic is identical to the previous version)
      session_id = str(uuid.uuid4())
      content = await file.read()
      content_type = file.content_type
      if content_type == "application/pdf": text = parse_pdf(content)
      elif content_type and content_type.startswith("image/"): text = parse_image(content)
+     else: text = content.decode("utf-8")
+     if not text.strip(): raise HTTPException(status_code=400, detail="No text could be extracted.")
+     text_chunks = semantic_chunker(text, embedding_model)
+     if not text_chunks: raise HTTPException(status_code=400, detail="Document too short to be processed.")
+     embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True)
+     serialized_index = create_faiss_index(embeddings)
+     if not serialized_index: raise HTTPException(status_code=500, detail="Failed to create document index.")
+     SESSION_DATA[session_id] = {"chunks": text_chunks, "index": serialized_index}
      logger.info(f"Session {session_id} created with {len(text_chunks)} chunks.")
      return {"session_id": session_id, "filename": file.filename, "chunks_created": len(text_chunks)}

  @app.post("/query/{session_id}", response_model=QueryResponse)
  async def query_session(session_id: str, request: QueryRequest):
+     # --- THIS ENDPOINT IS COMPLETELY REWORKED FOR PHI-2 ---
+     if not llm or not embedding_model:
          raise HTTPException(status_code=503, detail="AI models are not available.")
+
      session = SESSION_DATA.get(session_id)
      if not session:
          raise HTTPException(status_code=404, detail="Session not found.")

+     # Step 1: Retrieve relevant context (same as before)
+     query_with_prefix = f"Represent this sentence for searching relevant passages: {request.question}"
+     question_embedding = embedding_model.encode([query_with_prefix], convert_to_numpy=True).astype('float32')
+     index = deserialize_faiss_index(session["index"])
+     if not index: raise HTTPException(status_code=500, detail="Could not load session index.")
+     k = min(5, index.ntotal)
      distances, indices = index.search(question_embedding, k)
+     context = "\n".join([session["chunks"][i] for i in indices[0]])

+     # Step 2: Create a specific prompt for the generative model.
+     # This template instructs the model on how to behave.
+     prompt = f"""
+ Instruct: Use the following context to answer the question accurately. If the answer is not present in the context, say "The answer is not available in the provided document."
+
+ Context:
+ {context}
+
+ Question: {request.question}
+
+ Answer:"""
+
+     logger.info("Generating answer with Phi-2...")
+
+     # Step 3: Generate the answer
+     answer = llm(
+         prompt,
+         max_new_tokens=256,  # Max length of the answer
+         temperature=0.2,     # Lower temperature for more factual answers
+         stop=["\n", "Instruct:", "Question:"]  # Stop generation at these tokens
+     )
+
+     # Generative models don't give a confidence 'score' like extractive ones,
+     # so we simply return the generated text.
+     return {"answer": answer.strip(), "context": context}
core/__init__.py ADDED
File without changes
core/chunking.py ADDED
@@ -0,0 +1,46 @@
+ # core/chunking.py
+
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ def semantic_chunker(text: str, model: SentenceTransformer, similarity_threshold: float = 0.55):
+     """
+     Splits text into chunks based on semantic similarity of sentences.
+     """
+     logger.info("Starting semantic chunking...")
+     # First, split the document into sentences: a simple split by newline and period.
+     sentences = [s.strip() for s in text.replace("\n", ". ").split(".") if s.strip()]
+     if not sentences:
+         return []
+
+     # Generate embeddings for each sentence
+     embeddings = model.encode(sentences, convert_to_numpy=True)
+
+     chunks = []
+     current_chunk_sentences = [sentences[0]]
+
+     for i in range(1, len(sentences)):
+         # Calculate similarity between the current sentence and the previous one
+         similarity = cosine_similarity(
+             embeddings[i].reshape(1, -1),
+             embeddings[i - 1].reshape(1, -1)
+         )[0, 0]
+
+         # If similarity is below the threshold, it's a semantic break:
+         # finalize the current chunk and start a new one.
+         if similarity < similarity_threshold:
+             chunks.append(" ".join(current_chunk_sentences))
+             current_chunk_sentences = []
+
+         current_chunk_sentences.append(sentences[i])
+
+     # Add the last remaining chunk
+     if current_chunk_sentences:
+         chunks.append(" ".join(current_chunk_sentences))
+
+     logger.info(f"Semantic chunking resulted in {len(chunks)} chunks.")
+     return chunks
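
A quick sketch of how semantic_chunker behaves in isolation, using the same BGE model app.py loads. The sample text is made up, and whether a given sentence pair actually splits depends on the embeddings; the expected boundary is noted in a comment.

from sentence_transformers import SentenceTransformer
from core.chunking import semantic_chunker

model = SentenceTransformer('BAAI/bge-large-en-v1.5')
text = (
    "The invoice total was 4,200 USD. Payment is due within 30 days.\n"
    "Python is a popular programming language. It is widely used for AI."
)
# The two billing sentences should stay in one chunk; the topic shift to
# Python should drop cosine similarity below 0.55 and open a new chunk.
chunks = semantic_chunker(text, model)
for i, chunk in enumerate(chunks):
    print(i, chunk)
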
core/vector_store.py ADDED
@@ -0,0 +1,38 @@
+ # core/vector_store.py
+
+ import faiss
+ import io
+ import logging
+ from sentence_transformers import SentenceTransformer
+
+ logger = logging.getLogger(__name__)
+
+ def create_faiss_index(embeddings):
+     """Creates a FAISS index from a list of embeddings."""
+     if embeddings is None or len(embeddings) == 0:
+         logger.warning("No embeddings provided to create FAISS index.")
+         return None
+
+     dimension = embeddings.shape[1]
+     index = faiss.IndexFlatL2(dimension)
+     index.add(embeddings.astype('float32'))
+
+     # Serialize the index to bytes for in-memory storage
+     try:
+         bio = io.BytesIO()
+         writer = faiss.PyCallbackIOWriter(bio.write)
+         faiss.write_index(index, writer)
+         return bio.getvalue()
+     except Exception as e:
+         logger.error(f"Failed to serialize FAISS index: {e}")
+         return None
+
+ def deserialize_faiss_index(index_bytes: bytes) -> faiss.Index:
+     """Deserializes a FAISS index from bytes."""
+     try:
+         bio = io.BytesIO(index_bytes)
+         reader = faiss.PyCallbackIOReader(bio.read)
+         return faiss.read_index(reader)
+     except Exception as e:
+         logger.error(f"Failed to deserialize FAISS index: {e}")
+         return None
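
A small round-trip sketch for these helpers, using toy random vectors in place of real BGE embeddings:

import numpy as np
from core.vector_store import create_faiss_index, deserialize_faiss_index

# Toy embeddings: 4 vectors of dimension 8 (real ones come from the embedding model).
rng = np.random.default_rng(0)
embeddings = rng.random((4, 8)).astype('float32')

serialized = create_faiss_index(embeddings)   # bytes, safe to stash in SESSION_DATA
index = deserialize_faiss_index(serialized)   # back to a live faiss.Index

# The nearest neighbour of vector 0 should be itself (distance 0 under L2).
distances, indices = index.search(embeddings[:1], 2)
print(indices[0])
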
requirements.txt CHANGED
@@ -1,10 +1,13 @@
  fastapi
- uvicorn[standard]
  python-multipart
-
  PyMuPDF
  Pillow
  pytesseract
-
  sentence-transformers
- faiss-cpu
+ ctransformers>=0.2.27
  fastapi
+ uvicorn
  python-multipart
+ pydantic
  PyMuPDF
  Pillow
  pytesseract
  sentence-transformers
+ faiss-cpu
+ transformers
+ torch
+ scikit-learn