"""Model management — loads and runs local GGUF models via llama.cpp."""
from __future__ import annotations

import logging
from pathlib import Path
from typing import Generator

from hivemind.config import ModelConfig

log = logging.getLogger(__name__)


class Model:
    """Wrapper around llama-cpp-python for local inference.

    The underlying ``Llama`` instance is created lazily by :meth:`load`;
    until then every inference method raises ``RuntimeError``.
    """

    def __init__(self, config: ModelConfig):
        self.config = config
        # Holds the llama_cpp.Llama instance once load() has succeeded.
        self._llm = None

    def load(self) -> None:
        """Load the GGUF model into memory.

        Raises:
            FileNotFoundError: If ``config.path`` is not an existing file.
        """
        path = Path(self.config.path)
        # is_file() rather than exists(): a directory would otherwise pass
        # validation and fail later inside llama.cpp with an opaque error.
        if not path.is_file():
            raise FileNotFoundError(f"Model not found: {path}")

        # Imported lazily (and only after the cheap path check) so that this
        # module can be imported without llama-cpp-python installed.
        from llama_cpp import Llama

        log.info("Loading model: %s", path.name)
        self._llm = Llama(
            model_path=str(path),
            n_ctx=self.config.n_ctx,
            n_gpu_layers=self.config.n_gpu_layers,
            # A falsy thread count (0 / None) is passed on as None.
            n_threads=self.config.n_threads or None,
            verbose=False,
        )
        log.info("Model loaded: %s (ctx=%d)", path.name, self.config.n_ctx)

    @property
    def loaded(self) -> bool:
        """Whether a model instance is currently held."""
        return self._llm is not None

    def generate(
        self,
        messages: list[dict],
        max_tokens: int = 1024,
        temperature: float = 0.7,
        stream: bool = False,
    ) -> str | Generator[str, None, None]:
        """Generate a response from chat messages.

        Args:
            messages: List of {"role": "system"|"user"|"assistant", "content": "..."}
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            stream: If True, return a generator yielding text pieces as
                they're generated instead of the full string

        Returns:
            The complete response text, or a generator of text pieces when
            ``stream`` is True. A missing/None content is returned as "".

        Raises:
            RuntimeError: If the model has not been loaded.
        """
        # Checked eagerly here (not inside the generator) so that streaming
        # callers also get the error at call time rather than on first next().
        if self._llm is None:
            raise RuntimeError("Model not loaded. Call load() first.")

        if stream:
            return self._stream(messages, max_tokens, temperature)

        result = self._llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        content = result["choices"][0]["message"]["content"]
        # The content field may be None; honour the declared str return type.
        return content or ""

    def _stream(
        self, messages: list[dict], max_tokens: int, temperature: float
    ) -> Generator[str, None, None]:
        """Yield non-empty content pieces from a streaming chat completion."""
        for chunk in self._llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=True,
        ):
            delta = chunk["choices"][0].get("delta", {})
            token = delta.get("content")
            # Skip chunks whose delta has no text (absent, None or empty).
            if token:
                yield token

    def load_lora(self, lora_path: str) -> None:
        """Load a LoRA adapter on top of the base model.

        If the underlying model object has no ``load_lora`` method, a warning
        is logged and nothing is loaded.

        Args:
            lora_path: Filesystem path of the adapter file.

        Raises:
            RuntimeError: If the base model has not been loaded.
            FileNotFoundError: If ``lora_path`` is not an existing file.
            Exception: Any error raised by the underlying loader is logged
                and re-raised unchanged.
        """
        if self._llm is None:
            raise RuntimeError("Base model must be loaded first.")
        path = Path(lora_path)
        if not path.is_file():
            raise FileNotFoundError(f"LoRA adapter not found: {path}")

        # Detect missing support by looking the method up, instead of catching
        # AttributeError around the call: that would also swallow unrelated
        # AttributeErrors raised inside the loader and misreport them.
        loader = getattr(self._llm, "load_lora", None)
        if loader is None:
            log.warning("llama-cpp-python version doesn't support load_lora()")
            return

        try:
            loader(str(path))
        except Exception as e:
            log.error("Failed to load LoRA: %s", e)
            raise
        log.info("LoRA adapter loaded: %s", path.name)

    def embed(self, text: str) -> list[float]:
        """Get embedding vector for text (used for cache similarity).

        Returns:
            The value returned by the model's ``embed`` call, or an empty
            list if that call fails for any reason.

        Raises:
            RuntimeError: If the model has not been loaded.
        """
        if self._llm is None:
            raise RuntimeError("Model not loaded.")
        # Deliberately best-effort: not all models support embeddings, so any
        # failure degrades to an empty vector instead of propagating.
        try:
            return self._llm.embed(text)
        except Exception:
            log.warning("Model doesn't support embeddings, cache similarity disabled")
            # Keep the underlying cause available for diagnosis.
            log.debug("Embedding failure detail", exc_info=True)
            return []
