Sidoine1991 commited on
Commit ·
3e9d6c3
1
Parent(s): 2809368
Initial commit CoranIA: ajout app, index, README et index zip
Browse files- README.md +72 -12
- app.py +109 -0
- chroma_db_coran.zip +3 -0
- index_coran.py +61 -0
- requirements.txt +6 -0
README.md
CHANGED
|
@@ -1,14 +1,74 @@
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
|
|
|
| 1 |
---
|
| 2 |
+
# CoranIA 📖🤖
|
| 3 |
+
|
| 4 |
+
CoranIA est une application RAG (Retrieval-Augmented Generation) pour poser des questions sur le Coran en français.
|
| 5 |
+
Elle utilise :
|
| 6 |
+
- ChromaDB pour l’index documentaire.
|
| 7 |
+
- Google Gemini pour la génération de réponses naturelles.
|
| 8 |
+
- Streamlit pour l’interface.
|
| 9 |
+
|
| 10 |
+
## Fonctionnalités
|
| 11 |
+
|
| 12 |
+
- Recherche sémantique dans le Coran grâce à ChromaDB.
|
| 13 |
+
- Génération de réponses en langage naturel avec Gemini.
|
| 14 |
+
- Boutons de partage (WhatsApp, Facebook).
|
| 15 |
+
- Extraction automatique de l’index zip.
|
| 16 |
+
- Interface simple et responsive.
|
| 17 |
+
|
| 18 |
+
## Structure du projet
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
CoranIA/
|
| 22 |
+
├─ app.py # Application Streamlit
|
| 23 |
+
├─ index_coran.py # Script d’indexation du PDF du Coran
|
| 24 |
+
├─ coran_french.pdf # PDF source du Coran
|
| 25 |
+
├─ chroma_db_coran/ # Dossier d’index ChromaDB (après indexation)
|
| 26 |
+
├─ chroma_db_coran.zip # Archive de l’index (upload sur HF)
|
| 27 |
+
├─ .env # Variables d’environnement (API Keys)
|
| 28 |
+
├─ requirements.txt # Dépendances Python
|
| 29 |
+
└─ README.md # Documentation du projet
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## Installation et exécution locales
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
git clone https://huggingface.co/spaces/Sidoineko/CoranIA
|
| 36 |
+
cd CoranIA
|
| 37 |
+
|
| 38 |
+
# Créer et activer un venv
|
| 39 |
+
python -m venv venv
|
| 40 |
+
venv\Scripts\activate # Windows
|
| 41 |
+
# source venv/bin/activate # Linux/Mac
|
| 42 |
+
|
| 43 |
+
# Installer les dépendances
|
| 44 |
+
pip install -r requirements.txt
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
### Configurer la clé API
|
| 48 |
+
|
| 49 |
+
Créer un fichier `.env` à la racine :
|
| 50 |
+
```ini
|
| 51 |
+
GOOGLE_API_KEY=...
|
| 52 |
+
GEMINI_API_KEY_2=...
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
### Générer l’index (si non déjà présent)
|
| 56 |
+
```bash
|
| 57 |
+
python index_coran.py
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Lancer l’application
|
| 61 |
+
```bash
|
| 62 |
+
streamlit run app.py
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Déploiement sur Hugging Face Spaces
|
| 66 |
+
|
| 67 |
+
1. Zipper le dossier d’index : `chroma_db_coran/` → `chroma_db_coran.zip`
|
| 68 |
+
2. Uploader `chroma_db_coran.zip` dans le repo du Space (Dataset).
|
| 69 |
+
3. Pusher les sources (`app.py`, `index_coran.py`, `requirements.txt`, `README.md`).
|
| 70 |
+
4. Le Space se déploie automatiquement.
|
| 71 |
+
|
| 72 |
+
## License
|
| 73 |
|
| 74 |
+
MIT License
|
app.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import chromadb
|
| 5 |
+
from chromadb.utils import embedding_functions
|
| 6 |
+
import google.generativeai as genai
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
import urllib.parse
|
| 9 |
+
|
| 10 |
+
# Load environment variables (API keys) from a local .env file.
load_dotenv()

# --- Configurations ---
PERSIST_DIRECTORY = "chroma_db_coran"        # extracted ChromaDB index directory
INDEX_ZIP_PATH = "chroma_db_coran.zip"       # zipped index shipped with the Space
DB_COLLECTION_NAME = "coran_rag_collection"
EMBEDDING_MODEL_NAME = "models/text-embedding-004"
LLM_MODEL_NAME = "models/gemini-1.5-flash-latest"

# --- UI Header ---
# The logo is optional; a missing image file must not crash the app.
try:
    st.image("logo_coran.jpg", width=120)
except Exception:  # BUG FIX: was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
    pass
st.markdown("<h1 style='text-align:center; color:#2d8659;'>❓ CoranIA</h1>", unsafe_allow_html=True)
st.markdown("<p style='text-align:center; color:#444;'>📖 Posez une question sur le Coran</p>", unsafe_allow_html=True)
st.markdown("<hr>", unsafe_allow_html=True)

# --- Sidebar Configuration ---
with st.sidebar:
    st.header("⚙️ Configuration")
    n_results = st.slider("Nombre de passages à utiliser", 1, 5, 3)

# --- Ensure Index Exists ---
# On a fresh deployment (e.g. Hugging Face Spaces) the index only exists as a zip;
# extract it once into PERSIST_DIRECTORY.
if not os.path.exists(PERSIST_DIRECTORY):
    st.warning(f"🗂️ Index local non trouvé. Extraction de {INDEX_ZIP_PATH}...")
    try:
        with zipfile.ZipFile(INDEX_ZIP_PATH, "r") as z:
            z.extractall(PERSIST_DIRECTORY)
        st.success("✅ Index extrait avec succès.")
    except Exception as e:
        st.error(f"❌ Erreur lors de l'extraction de l'index : {e}")
        st.stop()

# --- API Key Setup ---
# Accept either env var name so local .env and Space secrets both work.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY_2")
if not GOOGLE_API_KEY:
    st.error("Clé API non trouvée. Définir GOOGLE_API_KEY ou GEMINI_API_KEY_2 dans .env ou secrets.")
    st.stop()

genai.configure(api_key=GOOGLE_API_KEY)

# --- ChromaDB & Embedding ---
# NOTE: the embedding model here must match the one used by index_coran.py,
# otherwise query embeddings will not align with the stored index.
embed_fn = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=GOOGLE_API_KEY, model_name=EMBEDDING_MODEL_NAME
)
client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
try:
    collection = client.get_collection(name=DB_COLLECTION_NAME, embedding_function=embed_fn)
except Exception:  # BUG FIX: narrowed from bare `except:`; the collection may simply not exist yet
    collection = client.create_collection(name=DB_COLLECTION_NAME, embedding_function=embed_fn)
|
| 62 |
+
|
| 63 |
+
# --- Generative Model ---
@st.cache_resource
def get_generative_model(model_name):
    """Build (and cache via Streamlit) the Gemini generative model.

    Returns the model instance, or None when initialisation fails —
    callers must handle the None case.
    """
    try:
        model = genai.GenerativeModel(model_name)
    except Exception as exc:
        st.error(f"Erreur initialisation modèle génératif: {exc}")
        return None
    return model

gen_model = get_generative_model(LLM_MODEL_NAME)
|
| 73 |
+
|
| 74 |
+
# --- Ask Function ---
def ask_coran(question: str):
    """Retrieve relevant Quran passages and generate a natural-language answer.

    Args:
        question: The user's question (French).

    Returns:
        Tuple ``(answer_text, context)``. ``(None, None)`` when the question
        is empty; ``(None, context)`` when the generative model is unavailable.
    """
    if not question:
        return None, None
    # Embed the query with the same embedding function used at index time.
    q_emb = embed_fn([question])[0]
    results = collection.query(query_embeddings=[q_emb], n_results=n_results)
    docs = results.get("documents", [[]])[0]
    context = "\n\n".join(docs)
    prompt = f"Voici des extraits du Coran :\n{context}\n\nQuestion : {question}\nRéponse :"
    # ROBUSTNESS FIX: get_generative_model() returns None on init failure;
    # the original called gen_model.generate_content unconditionally, which
    # would raise AttributeError instead of degrading gracefully.
    if gen_model is None:
        return None, context
    response = gen_model.generate_content(prompt)
    return response.text, context
|
| 85 |
+
|
| 86 |
+
# --- Main Interaction ---
question = st.text_input("Votre question sur le Coran", "")
if st.button("Envoyer"):
    if not question:
        st.warning("Entrez une question.")
    else:
        with st.spinner("Recherche et génération..."):
            answer, context = ask_coran(question)
        if answer:
            st.success(answer)
            with st.expander("Passages utilisés"):
                st.write(context)
            # --- Share Buttons ---
            share_text = f"Réponse CoranIA : {answer}"
            encoded = urllib.parse.quote_plus(share_text)
            app_url = "https://huggingface.co/spaces/Sidoineko/CoranIA"
            whatsapp_url = f"https://wa.me/?text={encoded}%20{app_url}"
            # BUG FIX: the original line contained a stray `"e={encoded}"`
            # fragment after the closing quote, making the f-string a syntax
            # error; the sharer endpoint takes the shared text via `quote`.
            facebook_url = f"https://www.facebook.com/sharer/sharer.php?u={app_url}&quote={encoded}"
            st.markdown(f"""
            <div style='text-align:center; margin-top:1em;'>
                <a href="{whatsapp_url}" target="_blank"><img src="https://img.icons8.com/color/48/000000/whatsapp--v1.png" width="32"/></a>
                <a href="{facebook_url}" target="_blank"><img src="https://img.icons8.com/color/48/000000/facebook-new.png" width="32"/></a>
            </div>
            """, unsafe_allow_html=True)
|
chroma_db_coran.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc0b63553f9e7a5da3be19275b46318f1179d74e96afebc7f6a44910cc0abdc2
|
| 3 |
+
size 16543212
|
index_coran.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import chromadb
|
| 4 |
+
import PyPDF2
|
| 5 |
+
from chromadb.utils import embedding_functions
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
# --- Parameters ---
PDF_PATH = "coran_french.pdf"            # source PDF (French translation of the Quran)
CHROMA_DIR = "chroma_db_coran"           # ChromaDB persistence directory
COLLECTION_NAME = "coran_rag_collection"
EMBEDDING_MODEL = "models/text-embedding-004"
CHUNK_SIZE = 500 # characters per chunk
|
| 15 |
+
|
| 16 |
+
def extract_chunks(pdf_path, chunk_size=500):
    """Read a PDF and split its concatenated text into fixed-size chunks.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        List of non-blank text chunks (whitespace-only slices are dropped).
    """
    print(f"Lecture du PDF : {pdf_path}")
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # Pages with no extractable text yield None -> substitute ''.
        full_text = "\n".join(page.extract_text() or '' for page in reader.pages)
    chunks = [
        full_text[start:start + chunk_size]
        for start in range(0, len(full_text), chunk_size)
        if full_text[start:start + chunk_size].strip()
    ]
    print(f"Nombre de chunks générés : {len(chunks)}")
    return chunks
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
    if not os.path.exists(PDF_PATH):
        print(f"ERREUR : Le fichier {PDF_PATH} est introuvable dans le dossier courant.")
        exit(1)

    print("--- Démarrage de l'indexation du Coran ---")
    t0 = time.time()
    chunks = extract_chunks(PDF_PATH, CHUNK_SIZE)

    # Recreate the collection from scratch so re-runs do not duplicate chunks.
    client = chromadb.PersistentClient(path=CHROMA_DIR)
    if COLLECTION_NAME in [c.name for c in client.list_collections()]:
        print(f"Suppression de l'ancienne collection '{COLLECTION_NAME}'...")
        client.delete_collection(COLLECTION_NAME)
    collection = client.create_collection(COLLECTION_NAME)

    # SECURITY/BUG FIX: the original line was
    #   os.getenv("GEMINI_API_KEY_2=AIzaSy...")
    # which (a) leaked a hard-coded API key into source control — that key
    # must be revoked — and (b) passed "NAME=value" as the variable *name*,
    # so the lookup always returned None. Read the key from the environment
    # only, using the same variable names as app.py and the error below.
    api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY_2")
    if not api_key:
        raise ValueError("Aucune clé API Gemini/Google n'a été trouvée dans GOOGLE_API_KEY ou GEMINI_API_KEY_2.")
    embedding_fn = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
        api_key=api_key, model_name=EMBEDDING_MODEL
    )

    print("Indexation des chunks dans ChromaDB...")
    batch_size = 32
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        ids = [f"chunk_{i+j}" for j in range(len(batch))]
        try:
            # Embeddings are computed explicitly so failures are per-batch
            # and indexing continues best-effort on errors.
            collection.add(documents=batch, ids=ids, embeddings=embedding_fn(batch))
        except Exception as e:
            print(f"Erreur à l'indexation du batch {i//batch_size}: {e}")
        print(f"Progression : {min(i+batch_size, len(chunks))}/{len(chunks)} chunks indexés")
    print(f"Indexation terminée en {time.time()-t0:.1f} secondes.")
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
chromadb==0.4.24
|
| 2 |
+
PyPDF2==3.0.1
|
| 3 |
+
streamlit==1.32.2
|
| 4 |
+
google-generativeai
|
| 5 |
+
requests
|
| 6 |
+
python-dotenv
|