"""Streamlit app: upload a DOCX, detect its language, extract metadata,
and generate an abstract plus recommendations with transformer pipelines."""

import os

import spacy
import streamlit as st
from docx import Document
from langdetect import detect, LangDetectException
from transformers import pipeline

SPACY_MODEL = "en_core_web_sm"


@st.cache_resource
def _load_nlp():
    """Load (downloading first if missing) the spaCy pipeline.

    Cached with st.cache_resource: Streamlit re-runs this script on every
    user interaction, so without caching the model would be reloaded each
    rerun.
    """
    if not spacy.util.is_package(SPACY_MODEL):
        os.system(f"python -m spacy download {SPACY_MODEL}")
    return spacy.load(SPACY_MODEL)


@st.cache_resource
def _load_summarizer():
    """Summarization pipeline (cached so reruns don't reload the weights)."""
    return pipeline("summarization", model="meta-llama/Meta-Llama-3-8B")


@st.cache_resource
def _load_recommender():
    """Text-generation pipeline for recommendations (cached)."""
    return pipeline("text-generation", model="google/gemma-2b-it")


def extract_text_from_docx(docx_file):
    """Return all non-empty paragraph text from a DOCX file, newline-joined.

    Args:
        docx_file: path or file-like object accepted by python-docx.

    Returns:
        str: the document body text; "" if the file has no non-blank
        paragraphs.
    """
    doc = Document(docx_file)
    return "\n".join(para.text for para in doc.paragraphs if para.text.strip())


def detect_language(text):
    """Detect the document's language code (e.g. 'en').

    Returns 'unknown' instead of raising when the text is empty or has no
    detectable features (langdetect raises LangDetectException in that case,
    which previously crashed the app).
    """
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


def extract_metadata(text):
    """Compute a word count and the named entities found in *text*.

    Entities are grouped into lists per label: the original dict
    comprehension keyed by label kept only the LAST entity of each label,
    silently dropping the rest.

    Returns:
        dict: {"Word Count": int, "Entities": {label: [entity text, ...]}}
    """
    doc = _load_nlp()(text)
    entities = {}
    for ent in doc.ents:
        entities.setdefault(ent.label_, []).append(ent.text)
    return {
        "Word Count": len(text.split()),
        "Entities": entities,
    }


def generate_summary(text):
    """Generate an abstract of *text* via the summarization pipeline."""
    summary = _load_summarizer()(text, max_length=200, min_length=50, do_sample=False)
    return summary[0]["summary_text"]


def generate_recommendations(text):
    """Generate three recommendations for *text* via the generation pipeline."""
    prompt = (
        "Provide three key recommendations based on the following document:\n"
        f"{text}\n\nRecommendations:"
    )
    recommendations = _load_recommender()(
        prompt, max_length=300, num_return_sequences=1, do_sample=False
    )
    return recommendations[0]["generated_text"]


# --- Streamlit UI ---
st.title("📄 AI-Powered Multi-Language Document Analyzer")

uploaded_file = st.file_uploader("Upload a Word Document", type=["docx"])

if uploaded_file:
    st.success("File uploaded successfully!")
    doc_text = extract_text_from_docx(uploaded_file)

    # Guard: a DOCX containing no text would otherwise be fed straight into
    # language detection, NER, and the LLM calls.
    if not doc_text:
        st.warning("No text could be extracted from this document.")
    else:
        language = detect_language(doc_text)
        metadata = extract_metadata(doc_text)

        st.subheader("Extracted Text:")
        st.text_area("Document Content", doc_text, height=250)

        st.subheader("🗣️ Detected Language:")
        st.write(language)

        st.subheader("📊 Metadata:")
        st.json(metadata)

        # Heavy LLM calls only run on explicit request.
        if st.button("Generate Abstract & Recommendations"):
            with st.spinner("Analyzing..."):
                summary = generate_summary(doc_text)
                recommendations = generate_recommendations(doc_text)

            st.subheader("📌 Abstract (Summary) - Llama 3")
            st.write(summary)

            st.subheader("✅ Recommendations - Gemma 2-9B-IT")
            st.write(recommendations)