"""Topic/keyword extraction for smart peer routing.

Two strategies — used in sequence:

  1. Fast path (always runs, < 1ms, no model):
     Regex-based stop-word filter. Good enough for routing in most cases.

  2. Model path (optional, async, ~100-500ms):
     Uses the locally loaded model with a tightly constrained prompt
     (max_tokens=50, temperature=0.0) — acts like a tiny dedicated
     keyword model without loading anything extra.
     Falls back to fast path on timeout or error.

Usage in node.py:
    topics = extract_fast(query)                      # immediate
    topics = await extract_with_model(query, model)   # better, async

Peer matching:
    score = score_peer(topics, specialization, expertise_tags)   # 0.0 – 1.0
"""
from __future__ import annotations

import asyncio
import logging
import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from hivemind.network.protocol import PeerInfo

log = logging.getLogger(__name__)

# ─── Stop words ──────────────────────────────────────────────────────────────

# Lower-case function words (German and English) that carry no topical signal.
# extract_fast() drops any word found here.  A few entries appear more than
# once in the literal (e.g. "bitte", "mal", "wie", "moechte"); that is harmless
# because the result is a set.  Both umlaut and ASCII transliterations
# ("für"/"fuer", "über"/"ueber") are listed so either spelling is filtered.
_STOPWORDS: frozenset[str] = frozenset({
    # German
    "ich", "du", "er", "sie", "es", "wir", "ihr", "mich", "mir", "dich", "dir",
    "ihn", "ihm", "uns", "euch", "mein", "dein", "sein", "unser", "euer",
    "der", "die", "das", "dem", "den", "des", "ein", "eine", "einer", "einem",
    "eines", "kein", "keine", "keiner", "keinem", "keines",
    "ist", "sind", "war", "waren", "bin", "bist", "hat", "haben", "hatte",
    "wird", "werden", "wurde", "worden", "habe", "habt",
    "und", "oder", "aber", "denn", "weil", "wenn", "als", "wie", "dass",
    "nicht", "nein", "ja", "auch", "noch", "nur", "schon", "immer", "mal",
    "man", "mehr", "viel", "sehr", "gut", "bitte", "danke", "gerne",
    "mit", "von", "bei", "aus", "nach", "zu", "fuer", "für", "auf", "in",
    "an", "über", "ueber", "unter", "vor", "hinter", "neben", "zwischen",
    "durch", "gegen", "ohne", "um", "bis", "seit", "laut", "trotz",
    "kann", "kannst", "könnte", "konnte", "soll", "sollte", "muss", "müssen",
    "musste", "darf", "dürfte", "mag", "möchte", "moechte",
    "was", "wer", "wie", "wo", "wann", "warum", "welche", "welcher", "welches",
    "dieser", "diese", "dieses", "jener", "jene", "jenes",
    "alle", "alles", "jeden", "jeder", "jede",
    "bitte", "mal", "einfach", "halt", "doch", "eigentlich", "wirklich",
    "gibt", "geben", "geht", "gehen", "machen", "macht", "gemacht",
    "sagen", "sagt", "gesagt", "zeigen", "erklaeren", "erklären",
    "helfen", "hilf", "hilft", "brauche", "brauchen", "moechte",
    # English
    "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us",
    "my", "your", "his", "its", "our", "their", "a", "an", "the",
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "do", "does", "did", "will", "would", "could", "should", "may", "might",
    "and", "or", "but", "if", "because", "when", "while", "as", "than",
    "not", "no", "yes", "also", "just", "only", "very", "much", "many",
    "with", "from", "by", "at", "in", "on", "to", "for", "of", "about",
    "what", "who", "how", "where", "why", "which",
    "this", "that", "these", "those", "there", "here",
    "can", "please", "thank", "thanks", "want", "need", "help", "make",
    "get", "give", "show", "explain", "tell", "say", "use", "using",
})

_MIN_LEN = 3   # minimum keyword length
_MAX_KEYWORDS = 8   # default cap on keywords returned by extract_fast()

# ─── Fast extractor ───────────────────────────────────────────────────────────

def extract_fast(query: str, max_keywords: int = _MAX_KEYWORDS) -> list[str]:
    """Rule-based keyword extraction — pure Python, no model.

    - Double-quoted phrases (2–40 characters) are extracted first, since they
      carry the strongest intent, e.g. "machine learning".
    - Every run of letters of at least ``_MIN_LEN`` characters is then taken
      as a candidate word and dropped if it is in ``_STOPWORDS``.  Words that
      occur inside a quoted phrase are also emitted individually.
    - Results are lower-cased, de-duplicated, and kept in first-occurrence
      order.

    Args:
        query: Free-text user query.
        max_keywords: Maximum number of keywords to return.

    Returns:
        Up to ``max_keywords`` lower-case keywords / phrases.
    """
    keywords: list[str] = []
    seen: set[str] = set()

    # Quoted phrases carry strong intent (e.g. "machine learning")
    for phrase in re.findall(r'"([^"]{2,40})"', query):
        k = phrase.strip().lower()
        if k and k not in seen:
            keywords.append(k)
            seen.add(k)

    # Individual words.  ``[^\W\d_]`` is "any word character that is neither
    # a digit nor an underscore", i.e. a letter in any script.  A hand-written
    # class such as [a-zA-ZäöüÄÖÜß] would cut words like "résumé" into
    # meaningless fragments ("sum").  The minimum length comes from _MIN_LEN
    # so the constant and the pattern cannot drift apart.
    word_pattern = r"[^\W\d_]{%d,}" % _MIN_LEN
    for word in re.findall(word_pattern, query):
        w = word.lower()
        if w not in _STOPWORDS and w not in seen:
            keywords.append(w)
            seen.add(w)

    return keywords[:max_keywords]


# ─── Model-based extractor ────────────────────────────────────────────────────

# System prompt (German): tells the model it is a keyword extractor and must
# answer with nothing but a comma-separated keyword list.
_MODEL_SYSTEM = (
    "Du bist ein Keyword-Extraktor. "
    "Antworte IMMER nur mit einer kommagetrennten Liste von Stichwörtern. "
    "Keine Erklärungen, kein Satztext, nur Keywords."
)

# User prompt template (German): asks for at most 6 topic keywords, in German
# or English, for the query substituted at ``{query}``.
_MODEL_USER = (
    "Extrahiere maximal 6 Themen-Keywords aus dieser Anfrage "
    "(Deutsch oder Englisch, je nachdem was passt):\n\n{query}\n\nKeywords:"
)

_MODEL_TIMEOUT = 5.0   # seconds — tight budget for a routing helper


def _parse_model_keywords(raw: str) -> list[str]:
    """Turn the raw model reply into a clean, de-duplicated keyword list.

    Only the first non-empty line is considered.  It is split on ``,``, ``;``
    and ``/``; each piece is stripped and lower-cased.  Pieces that are
    shorter than 2 characters, 40 characters or longer, or more than three
    words long (i.e. look like a sentence) are dropped.
    """
    # Skip leading blank lines: a reply that starts with "\n" is still usable.
    first_line = next((ln for ln in raw.splitlines() if ln.strip()), "")

    keywords: list[str] = []
    seen: set[str] = set()
    for candidate in re.split(r"[,;/]", first_line):
        k = candidate.strip().lower()
        # Drop obviously bad outputs (too short, too long, looks like a sentence)
        if not (2 <= len(k) < 40) or len(k.split()) > 3:
            continue
        if k not in seen:
            keywords.append(k)
            seen.add(k)
    return keywords


async def extract_with_model(
    query: str,
    model: object,
    max_keywords: int = 6,
) -> list[str]:
    """Use the locally loaded model as a fast keyword extractor.

    Sends a constrained prompt (max_tokens=50, temperature=0.0) so the model
    only generates a short comma-separated list.  The blocking
    ``model.generate`` call runs in the event loop's default executor and is
    awaited for at most ``_MODEL_TIMEOUT`` seconds.

    Args:
        query: User query; only the first 400 characters are sent to the model.
        model: Object exposing a truthy ``loaded`` attribute and a
            ``generate(messages, max_tokens=..., temperature=..., stream=...)``
            method that returns the reply text.
        max_keywords: Maximum number of keywords to return.

    Returns:
        Up to ``max_keywords`` lower-case keywords.  Falls back to
        ``extract_fast(query, max_keywords)`` when the model is missing or not
        loaded, on timeout, on any error, or when the reply yields no usable
        keyword.
    """
    if model is None or not getattr(model, "loaded", False):
        return extract_fast(query, max_keywords)

    messages = [
        {"role": "system", "content": _MODEL_SYSTEM},
        {"role": "user", "content": _MODEL_USER.format(query=query[:400])},
    ]

    try:
        # We are inside a coroutine, so a loop is guaranteed to be running.
        loop = asyncio.get_running_loop()
        # NOTE: on timeout only the await is abandoned; the executor thread
        # keeps running generate() until it returns on its own.
        raw: str = await asyncio.wait_for(
            loop.run_in_executor(
                None,
                lambda: model.generate(
                    messages,
                    max_tokens=50,
                    temperature=0.0,
                    stream=False,
                ),
            ),
            timeout=_MODEL_TIMEOUT,
        )

        keywords = _parse_model_keywords(raw)[:max_keywords]
        if keywords:
            log.debug("Model topics: %s", keywords)
            return keywords

    except asyncio.TimeoutError:
        log.debug("Topic model timed out — using fast extractor")
    except Exception as exc:
        # Deliberate best-effort: routing must never fail because the helper
        # model misbehaved (bad return type, runtime error, ...).
        log.debug("Topic model error (%s) — using fast extractor", exc)

    return extract_fast(query, max_keywords)


# ─── Peer scoring ─────────────────────────────────────────────────────────────

def score_peer(topics: list[str], specialization: str | None,
               expertise_tags: list[str] | None) -> float:
    """Score how well a peer's profile matches the given topics (0.0 – 1.0).

    All comparisons are case-insensitive.  Points per topic:

      Tag check (first hit wins):
        topic equals an expertise tag                    → +2.0
        topic is a substring of a tag, or vice versa     → +0.8
      Specialization check (first hit wins):
        topic is a substring of the specialization text  → +1.5
        topic shares at least one word with it           → +0.5

    The sum is divided by the best achievable total (3.5 per topic) and
    capped at 1.0.

    Args:
        topics: Keywords extracted from the query.  Empty or whitespace-only
            entries are ignored.
        specialization: Free-text description of the peer; ``None`` or ``""``
            means "not declared".
        expertise_tags: Tags declared by the peer; ``None`` or ``[]`` means
            "none declared".  Empty or whitespace-only tags are ignored,
            because an empty tag is a substring of every topic and would
            otherwise count as a partial match for all of them.

    Returns:
        A score in [0.0, 1.0].  ``0.5`` (neutral — the peer might still help)
        when the peer has no usable profile or when there are no usable topics.
    """
    spec_lower = (specialization or "").strip().lower()
    tags_lower = [
        tag.strip().lower()
        for tag in (expertise_tags or [])
        if tag and tag.strip()
    ]
    if not spec_lower and not tags_lower:
        return 0.5   # no profile → neutral

    clean_topics = [
        topic.strip().lower()
        for topic in (topics or [])
        if topic and topic.strip()
    ]
    if not clean_topics:
        return 0.5   # no topics → neutral

    tag_set = set(tags_lower)
    spec_words = set(re.findall(r"\w+", spec_lower))

    score = 0.0
    for t in clean_topics:
        # Exact tag match beats partial tag match; at most one applies.
        if t in tag_set:
            score += 2.0
        elif any(t in tag or tag in t for tag in tags_lower):
            score += 0.8

        # Match against the specialization string; at most one applies.
        if spec_lower:
            if t in spec_lower:
                score += 1.5
            elif spec_words.intersection(re.findall(r"\w+", t)):
                score += 0.5

    # Normalise: "perfect" = every topic gets 2.0 (exact tag) + 1.5 (spec) = 3.5
    max_possible = len(clean_topics) * 3.5
    return min(1.0, score / max_possible)
