import os
import glob
from typing import List
from multiprocessing import Pool
from tqdm import tqdm
import requests

from flask import Flask, request, jsonify
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts.prompt import PromptTemplate

app = Flask(__name__)

# Load environment variables
persist_directory = os.environ.get('PERSIST_DIRECTORY')
#source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
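# NOTE: persist_directory and embeddings_model_name are read here but not referenced below;
# the routes persist to "chromadb/<folder>" and use OpenAIEmbeddings directly.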
chunk_size = 1000
chunk_overlap = 200
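# chunk_size and chunk_overlap are character counts: RecursiveCharacterTextSplitter
# measures chunks by characters, not tokens.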


# Custom document loaders
class MyElmLoader(UnstructuredEmailLoader):
    """Wrapper to fallback to text/plain when default does not work"""

    def load(self) -> List[Document]:
        """Wrapper adding fallback for elm without html"""
        try:
            try:
                doc = UnstructuredEmailLoader.load(self)
            except ValueError as e:
                if 'text/html content not found in email' in str(e):
                    # Try plain text
                    self.unstructured_kwargs["content_source"]="text/plain"
                    doc = UnstructuredEmailLoader.load(self)
                else:
                    raise
        except Exception as e:
            # Add file_path to exception message
            raise type(e)(f"{self.file_path}: {e}") from e

        return doc


# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".eml": (MyElmLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}

def load_single_document(file_path: str) -> List[Document]:
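    """Load one file using the loader registered for its extension in LOADER_MAPPING."""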
    ext = "." + file_path.rsplit(".", 1)[-1].lower()
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()

    raise ValueError(f"Unsupported file extension '{ext}'")

def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
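    """Recursively collect every supported file under source_dir, skip anything in
    ignored_files, and load the rest in parallel (one worker per CPU core).
    """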
    all_files = []
    for ext in LOADER_MAPPING:
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True)
        )
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True)
        )
    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]

    with Pool(processes=os.cpu_count()) as pool:
        results = []
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results

def process_documents(source_directory, ignored_files: List[str] = []) -> List[Document]:
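    """Load all documents from source_directory and split them into overlapping chunks."""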
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory, ignored_files)
    if not documents:
        # Raise instead of exit(0): killing the process inside a Flask request
        # would bypass the route's error handling.
        raise ValueError(f"No new documents found in {source_directory}")
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    print(f"Split into {len(documents)} chunks of text (max. {chunk_size} tokens each)")
    return documents

def download_and_process_files(folder_name, url_mapping):
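    """Download each URL in url_mapping into folder_name, saving it under the mapped file name."""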
    # Create the download folder if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    for url, file_name in url_mapping.items():
        response = requests.get(url, timeout=60)  # avoid hanging indefinitely on a stalled download

        if response.status_code == 200:
            file_path = os.path.join(folder_name, file_name)
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f'Downloaded {file_name} successfully.')

        else:
            print(f"Failed to download {file_name}. Status code: {response.status_code}")
    return "Success"

@app.route('/save_embeddings', methods=['POST'])
def save_embeddings_to_db():
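    """Create a Chroma vector index from the documents in a local folder.

    Expected JSON body (illustrative values; keys taken from the code below):
        {
            "folder": "my_index",                # subdirectory of chromadb/ to persist to
            "folder_path": "source_documents",   # directory containing the files to embed
            "api_key": "sk-..."                  # OpenAI API key used for embeddings
        }
    """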
    try:
        #url_mapping = {
        #    'https://travellingcoaches.s3.us-west-1.amazonaws.com/lms-updated/super/reference_material/EXCL19200QuickReferenceCard_e0b6bc9.docx': 'document1.docx',
        #    'https://travellingcoaches.s3.us-west-1.amazonaws.com/lms-updated/super/reference_material/05c493e2a7c03a8a4e4e6fdca3e399e6.pdf': 'document2.pdf'
        #}
        #download_and_process_files(folder_name, url_mapping)
        data = request.get_json()
        #text = data['text']
        folder_name = data['folder']
        api_key = data['api_key']
        
        print(folder_name)
        folder_path = data['folder_path']
        #name = data.get('name', '') 
        documents = process_documents(folder_path)
        embeddings = OpenAIEmbeddings(openai_api_key=api_key)
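        # Build the Chroma index and flush it to disk so /get_responses can reopen it later.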
        vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory="chromadb/" + folder_name)
        vectordb.persist()

        return jsonify({'message': 'Embeddings processed and saved', 'status' : 'success'})
    except Exception as e:
        return jsonify({'error': str(e), 'status': 'error'}), 500


@app.route('/get_responses', methods=['POST'])
def get_responses():
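    """Answer a question against a previously built Chroma index.

    Expected JSON body (illustrative values; keys taken from the code below):
        {
            "query": "How do I freeze panes in Excel?",          # user question
            "api_key": "sk-...",                                  # OpenAI API key
            "folder": "my_index",                                 # index created by /save_embeddings
            "context": [{"question": "...", "answer": "..."}]     # optional prior turns
        }
    """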
    try:
        data = request.get_json()
        query = data.get('query', '') 
        api_key = data.get('api_key', '') 
        context = data.get('context', '') 
        question = query
        persist_dir = data.get('folder')

        _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

        Chat History:
        {chat_history}
        Follow Up Input: {question}
        Standalone question:"""
        CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

        embeddings = OpenAIEmbeddings(openai_api_key=api_key)

        db = Chroma(persist_directory="chromadb/" + persist_dir, embedding_function=embeddings)
        retriever = db.as_retriever(search_kwargs={"k": 3})

        # gpt-3.5-turbo is a chat model, so use ChatOpenAI rather than the completion-style OpenAI wrapper.
        # Chat history is passed explicitly on each call, so no ConversationBufferMemory is needed.
        qa = ConversationalRetrievalChain.from_llm(
            ChatOpenAI(openai_api_key=api_key, model_name="gpt-3.5-turbo", temperature=0),
            retriever,
            condense_question_prompt=CONDENSE_QUESTION_PROMPT,
            verbose=True
        )

        chat_history = []
        for item in context:
            chat_history.append((item.get('question'), item.get('answer'))) 

        print(chat_history)
        result = qa({"question": question, "chat_history" : chat_history})
        return result["answer"]

    except Exception as e:
        return jsonify({'error': str(e), 'status': 'error'}), 500

if __name__ == '__main__':
    app.run(debug=True)
