"""Global Memory — persistent facts about the user.

Automatically extracts and stores important information from conversations.
Injected into every session as system context.
"""
from __future__ import annotations

import json
import logging
import re
import time
from pathlib import Path

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


# ─── Fact Extraction Engine ──────────────────────────────────

# Each pattern: (regex, category).
# Patterns are tried in order; first match per category wins.
# Use (?:...) groups liberally; the value is always capture group 1, except for
# "_favorit_dynamic" (groups 1/2 hold the favourite type, group 3 the value).

# Table of (regex, category) pairs used for fact extraction. All regexes are
# applied case-insensitively by the caller. The extracted value is capture
# group 1, except for "_favorit_dynamic" (groups 1/2 = type, group 3 = value).
_FACT_PATTERNS: list[tuple[str, str]] = [
    # ── Name ──
    # "Ich heiße Reiner Michael", "Mein Name ist Glymax", 'Mein Name ist "Reiner Michael"'
    # Capture everything after the trigger up to a separator; cleaned in post-processing.
    # The leading \b plus the mandatory apostrophe/space inside "i'm" keep the
    # German preposition "im" (and words ending in it, e.g. "beim") from being
    # read as "I'm" — the trade-off is that a sloppy English "im Max" no longer matches.
    # "hei(?:ß|ss?)e" also accepts the ASCII spelling "heisse".
    (r'\b(?:ich hei(?:ß|ss?)e|mein name ist|mein name lautet|i(?:\'\s?|\s)m|my name is|man nennt mich)\s+'
     r'(?:[uü]brigens?\w*\s+)?'
     r'"?(.+?)"?(?:\s+und\b|\s+bin\b|\s+aus\b|\s*[.,!?]|\s*$)',
     "name"),
    # Nickname: "Nenn mich Max"
    (r'(?:nenn mich|call me)\s+"?(\w[\w\s]*?\w|\w+)"?', "spitzname"),

    # ── Age ──
    # "bin 40 Jahre alt", "ich bin 40", "I am 25 years old"
    (r'(?:ich bin|i am|i\'?\s?m)\s+(\d{1,3})\s*(?:jahre?\s*alt|years?\s*old|j\.?\s*a\.?)', "alter"),
    # Also match without "alt": "bin 40" when followed by comma/period/end or more clauses
    (r'(?:ich bin|i am)\s+(\d{1,3})(?:\s*[,.]|\s+und\b|\s*$)', "alter"),

    # ── Place of residence ──
    # "komme aus Thüringen", "wohne in Berlin", "lebe in München"
    (r'(?:ich (?:komme|stamme|bin) aus|ich (?:wohne|lebe) in|i(?:\'m| am) from|i live in)\s+'
     r'([A-Z\u00c0-\u00ff][\w\u00e0-\u00ff]+(?:[\s-][A-Z\u00c0-\u00ff][\w\u00e0-\u00ff]+)*)',
     "wohnort"),

    # ── Occupation / role ──
    # "ich bin Programmierer", "bin leidenschaftlicher Programmierer"
    # "ich arbeite als Software-Entwickler", "ich bin der Entwickler von dir"
    (r'(?:ich arbeite als|ich bin\s+(?:von beruf|beruflich)\s*(?:ein[e]?\s+)?|'
     r'mein beruf ist|i work as(?: an?)?)\s+'
     r'([\w\u00e0-\u00ff][\w\u00e0-\u00ff\s-]{1,40}?)(?:\.|,|!|\s+und\b|\s+au[sß]|\s*$)',
     "beruf"),
    # "bin leidenschaftlicher/begeisterter/... <occupation>"
    (r'(?:ich )?bin\s+(?:ein\s+)?(?:leidenschaftlich(?:e[r]?)?|begeistert(?:e[r]?)?|'
     r'passioniert(?:e[r]?)?|professionell(?:e[r]?)?)\s*[r]?\s+'
     r'([\w\u00e0-\u00ff][\w\u00e0-\u00ff\s-]{1,30}?)(?:\.|,|!|\s+und\b|\s+au[sß]|\s*$)',
     "beruf"),
    # "ich bin der Entwickler von ..."
    (r'ich bin (?:der|die|das)\s+([\w\u00e0-\u00ff-]+(?:(?:in|er)\b)?)\s+von\b',
     "rolle"),

    # ── Relationship to the bot ──
    # "ich bin dein Entwickler/Creator/Schöpfer"
    (r'ich (?:bin|hab)\s+(?:dein[e]?\s+|der\s+|die\s+)?'
     r'(entwickler|creator|sch[oö]pfer|macher|erfinder|admin|chef|boss)\s*(?:von dir)?',
     "beziehung"),
    # "ich habe dich entwickelt/gebaut/programmiert"
    (r'ich hab(?:e)?\s+dich\s+(entwickelt|gebaut|programmiert|erstellt|gemacht)',
     "beziehung"),

    # ── Likes / dislikes ──
    # "ich mag Pizza", "ich liebe Katzen", "ich bin Fan von ..."
    # The negative lookahead keeps "ich mag keine/nicht ..." out of the likes;
    # those sentences are handled by the dislike pattern below.
    (r'(?:ich mag(?!\s+(?:kein(?:e[n]?)?|nicht)\b)|i like|ich liebe|i love|ich bin fan von|ich steh(?:e)? auf)\s+'
     r'(.+?)(?:\.|,|!|\s+und\b|\s+au(?:ß|ss?)er|\s*$)',
     "vorliebe"),
    (r'(?:ich mag (?:kein(?:e[n]?)?|nicht)|i don\'?t like|ich hasse|i hate)\s+'
     r'(.+?)(?:\.|,|!|\s+und\b|\s*$)',
     "abneigung"),

    # ── Hobbies ──
    # "mein Hobby ist ...", "meine Hobbys sind ..."
    (r'(?:mein(?:e)? hobbys? (?:ist|sind)|ich mache? gerne?|in meiner freizeit)\s+'
     r'(.+?)(?:\.|!|\s*$)',
     "hobbys"),

    # ── Favourites ──
    # "meine Lieblingssprache ist Python", "my favorite color is blue"
    # Group 1 (German) or group 2 (English) is the favourite type, group 3 the value.
    (r'(?:mein(?:e)?\s+lieblings(\w+)\s+ist|my\s+fav(?:ou?rite)?\s+(\w+)\s+is)\s+'
     r'(.+?)(?:\.|,|!|\s*$)',
     "_favorit_dynamic"),

    # ── Languages ──
    (r'(?:ich spreche|i speak|ich kann)\s+(.+?)(?:\.|!|\s*$)', "sprachen"),
    # "meine Muttersprache ist Deutsch"
    (r'(?:meine muttersprache ist|my (?:native|first) language is)\s+(\w+)', "muttersprache"),

    # ── Birthday ──
    (r'(?:mein geburtstag ist(?: am)?|ich habe? am|my birthday is)\s+'
     r'(\d{1,2}[\./]\s*\d{1,2}(?:[\./]\s*\d{2,4})?|\d{1,2}\.\s*\w+)',
     "geburtstag"),
    # "ich bin am 15.03. geboren"
    (r'(?:ich bin am)\s+(\d{1,2}[\./]\s*\d{1,2}(?:[\./]\s*\d{2,4})?)\s+geboren', "geburtstag"),

    # ── Pets ──
    # Group 1 is the animal; group 2 (optional) is the pet's name
    # ("... namens Bello" / "... heißt Bello" / "... heisst Bello").
    (r'(?:ich habe?\s+(?:eine?n?\s+)?|mein(?:e)?\s+)'
     r'(hund|katze|hamster|vogel|fisch|kaninchen|pferd|schildkr[oö]te|schlange|papagei)'
     r'(?:\s+(?:namens?|hei(?:ß|ss?)t)\s+(\w+))?',
     "haustier"),
]

# Display label for each built-in fact category (category key -> label).
_CATEGORY_LABELS = dict(
    name="Name",
    spitzname="Spitzname",
    alter="Alter",
    wohnort="Wohnort",
    beruf="Beruf",
    rolle="Rolle",
    beziehung="Beziehung",
    vorliebe="Mag",
    abneigung="Mag nicht",
    hobbys="Hobbys",
    sprachen="Sprachen",
    muttersprache="Muttersprache",
    geburtstag="Geburtstag",
    haustier="Haustier",
)

# Categories whose values accumulate (are appended) instead of being replaced.
_LIST_CATEGORIES = set(("vorliebe", "abneigung", "hobbys", "sprachen"))


# Clause separators: a comma followed by whitespace, or one of the connectives
# "und", "außerdem"/"ausserdem", "sowie", "aber", "also" surrounded by whitespace.
_CLAUSE_SEPARATOR_RE = re.compile(
    r'\s*(?:,\s+|\s+und\s+|\s+au(?:ß|ss?)erdem\s+|\s+sowie\s+|\s+aber\s+|\s+also\s+)'
)

# First-person verbs that may open a clause whose subject "ich" was dropped.
_IMPLICIT_SUBJECT_VERB_RE = re.compile(
    r'^(?:bin|komme|stamme|wohne|lebe|habe|arbeite|spreche|mag|liebe|hasse|hei(?:ß|ss?)e)\b',
    re.IGNORECASE,
)


def _split_clauses(text: str) -> list[str]:
    """Split a compound sentence into clauses for better pattern matching.

    The unmodified *text* is always the first element, followed by each
    distinct clause. A clause that starts with a first-person verb but has no
    subject gets "ich " prepended so the "ich ..." patterns can match it:

        "Ich heiße Max und bin 40 Jahre alt, komme aus Berlin"
        -> ["Ich heiße Max und bin 40 Jahre alt, komme aus Berlin",
            "Ich heiße Max", "ich bin 40 Jahre alt", "ich komme aus Berlin"]

    Args:
        text: The raw user message.

    Returns:
        The original text followed by its clauses, without repeats. A text
        that contains no separator yields a single-element list.
    """
    clauses = [text]  # Always try the full text first
    # Track what is already queued so an unsplittable text is not scanned twice.
    seen = {text.strip()}

    for raw_part in _CLAUSE_SEPARATOR_RE.split(text):
        clause = raw_part.strip()
        if not clause:
            continue
        # A clause that opens with one of the listed verbs cannot already
        # start with "ich", so the match alone means the subject is missing.
        if _IMPLICIT_SUBJECT_VERB_RE.match(clause):
            clause = "ich " + clause
        if clause in seen:
            continue
        seen.add(clause)
        clauses.append(clause)

    return clauses


class GlobalMemory:
    """Persistent memory that spans all sessions.

    Two kinds of data are kept in memory and persisted together in
    ``<data_dir>/memory.json``:

    * ``facts``  - mapping of category -> value, filled by regex extraction
      (:meth:`extract_facts`) or manually (:meth:`add_fact`);
    * ``custom`` - free-text notes from explicit "remember this" requests
      (:meth:`detect_remember_intent`) or :meth:`add_custom`.
    """

    # Maps the participle captured by "ich habe dich <verb>" to a role noun.
    _RELATION_VERB_TO_ROLE = {
        "entwickelt": "Entwickler",
        "gebaut": "Erbauer",
        "programmiert": "Programmierer",
        "erstellt": "Ersteller",
        "gemacht": "Ersteller",
    }

    def __init__(self, data_dir: str | Path = "data"):
        """Create the data directory if needed and load any saved memory.

        Args:
            data_dir: Directory that holds ``memory.json``.
        """
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.memory_path = self.data_dir / "memory.json"
        self._facts: dict[str, str] = {}
        self._custom: list[str] = []
        self._updated = 0.0
        self._load()

    def _load(self):
        """Load facts, notes and the timestamp from ``memory.json``.

        Best effort: a missing, unreadable or malformed file is logged and
        leaves the current in-memory state untouched instead of raising.
        Entries of the wrong container type fall back to empty defaults.
        """
        if not self.memory_path.exists():
            return
        try:
            data = json.loads(self.memory_path.read_text(encoding="utf-8"))
        except Exception as e:  # deliberate best effort at the persistence boundary
            log.warning("Failed to load memory: %s", e)
            return
        if not isinstance(data, dict):
            log.warning("Failed to load memory: %s", "top-level JSON value is not an object")
            return

        facts = data.get("facts", {})
        custom = data.get("custom", [])
        updated = data.get("updated", 0)
        self._facts = facts if isinstance(facts, dict) else {}
        self._custom = custom if isinstance(custom, list) else []
        self._updated = updated if isinstance(updated, (int, float)) else 0.0

    def _save(self):
        """Persist the current state and refresh the ``updated`` timestamp.

        The JSON is written to a sibling ``memory.json.tmp`` file first and
        then moved over ``memory.json``, so an interrupted write cannot leave
        a truncated memory file behind.
        """
        self._updated = time.time()
        payload = json.dumps(
            {
                "facts": self._facts,
                "custom": self._custom,
                "updated": self._updated,
            },
            ensure_ascii=False,
            indent=2,
        )
        tmp_path = self.memory_path.with_name(self.memory_path.name + ".tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(self.memory_path)

    @staticmethod
    def _list_contains(existing: str, value: str) -> bool:
        """Return True if *value* occurs in *existing* as whole word(s), ignoring case.

        A plain substring test would treat "Java" as already known once
        "JavaScript" is stored; requiring non-word characters (or the string
        edge) around the hit avoids that.
        """
        pattern = r'(?<!\w)' + re.escape(value) + r'(?!\w)'
        return re.search(pattern, existing, re.IGNORECASE) is not None

    @classmethod
    def _clean_value(cls, category: str, value: str) -> str | None:
        """Normalize a raw captured value; return None if it should be dropped."""
        if category == "name":
            # Remove trailing noise words
            value = re.sub(
                r'\s+(?:und|and|bin|komme|aus|aber|also|ich|habe|war|wohne|lebe|i am|i\'m)\b.*$',
                '', value, flags=re.IGNORECASE).strip()
            # Remove leading filler like "uebrigens"
            value = re.sub(
                r'^(?:[uü]brigens?\w*\s+)', '', value, flags=re.IGNORECASE).strip()
            if not value:
                return None

        if category == "beziehung":
            # "entwickelt" -> "Entwickler", "gebaut" -> "Erbauer", ...
            value = cls._RELATION_VERB_TO_ROLE.get(value.lower(), value.capitalize())

        if len(value) < 1 or len(value) > 120:
            return None

        # Clean up trailing prepositions/articles
        value = re.sub(r'\s+(?:der|die|das|von|aus|in|zu|am|im|an)\s*$', '', value).strip()
        return value or None

    @staticmethod
    def _label_for(category: str) -> str:
        """Return the display label for *category*.

        Dynamic ``lieblings<type>`` categories are only registered in
        ``_CATEGORY_LABELS`` while the extracting process is alive; deriving
        the same "Lieblings-<Type>" label here keeps the output identical
        after a restart.
        """
        label = _CATEGORY_LABELS.get(category)
        if label is not None:
            return label
        prefix = "lieblings"
        if category.startswith(prefix) and len(category) > len(prefix):
            return f"Lieblings-{category[len(prefix):].capitalize()}"
        return category.capitalize()

    def extract_facts(self, text: str) -> list[tuple[str, str]]:
        """Extract facts from user text using pattern matching.

        Splits compound sentences into clauses first, then matches every
        pattern against every clause. For non-list categories the first valid
        match in this message wins; list categories accumulate new values.
        The memory is saved when at least one fact was added or changed.

        Args:
            text: The raw user message.

        Returns:
            List of (category, value) tuples for facts that were newly stored
            or changed by this call.
        """
        found: list[tuple[str, str]] = []
        seen_categories: set[str] = set()

        for clause in _split_clauses(text):
            clause_stripped = clause.strip()
            if len(clause_stripped) < 3:
                continue

            for pattern, category in _FACT_PATTERNS:
                if category in seen_categories and category not in _LIST_CATEGORIES:
                    continue

                match = re.search(pattern, clause_stripped, re.IGNORECASE)
                if not match:
                    continue

                if category == "_favorit_dynamic":
                    # The favourite type comes from the German (group 1) or
                    # English (group 2) branch of the pattern.
                    fav_type = match.group(1) or match.group(2) or "sache"
                    raw_value = match.group(3)
                    category = f"lieblings{fav_type.lower()}"
                    _CATEGORY_LABELS.setdefault(category, f"Lieblings-{fav_type.capitalize()}")
                    # One pattern serves many dynamic categories, so the
                    # "already decided" check must be repeated on the real key.
                    if category in seen_categories:
                        continue
                else:
                    raw_value = match.group(1)

                value = self._clean_value(category, raw_value.strip().strip('"\''))
                if value is None:
                    continue

                if category in _LIST_CATEGORIES:
                    existing = self._facts.get(category, "")
                    if not self._list_contains(existing, value):
                        self._facts[category] = (existing + ", " + value).strip(", ")
                        found.append((category, value))
                else:
                    # Mark the category as decided even if the value equals the
                    # stored one, so a later clause cannot override the first match.
                    seen_categories.add(category)
                    if self._facts.get(category) != value:
                        self._facts[category] = value
                        found.append((category, value))

        if found:
            self._save()
            for cat, val in found:
                log.info("Memory: %s = %s", cat, val)

        return found

    # ── Remember-Intent Patterns ─────────────────────────────────────────────
    _REMEMBER_TRIGGERS = re.compile(
        r'(?:'
        # "bitte merk dir [das] ...", "bitte merke dir ...", "bitte speichere ..."
        r'bitte\s+(?:merk(?:e(?:st)?)?|erinnere?(?:\s+dich)?(?:\s+daran)?|speichere?)\w*'
        r'(?:\s+dir)?(?:\s+das)?(?:\s*[,!:\s]\s*|\s+)(?:dass\s+)?|'
        # "merk dir: ..." / "merke dir ..." — forward declaration
        r'(?:merk(?:e(?:st)?)?)\s+dir\s*[:,!]?\s*|'
        # "vergiss nicht[,] ..." / "vergiss das nicht"
        r'vergiss\s+(?:das\s+)?nicht\s*[,!]?\s*|'
        # English
        r'please\s+remember\s+(?:that\s+)?|'
        r'remember\s+(?:that\s+)?'
        r')',
        re.IGNORECASE,
    )
    # "<fact>, bitte merk dir DAS / DEN / DIE / ES" — retroactive reference.
    # Accepts the full imperative forms ("merke", "erinnere", "speichere"),
    # "dich"/"daran" as used with "erinnere", and a period before "bitte".
    _REMEMBER_RETROACTIVE = re.compile(
        r'^(.+?)\s*[,.!;]\s*bitte\s+(?:merke?|erinnere?|speichere?)\s+(?:(?:dir|dich)\s+)?'
        r'(?:das|den|die|es|daran|sich\s+das|sich\s+es)\s*[.!]?\s*$',
        re.IGNORECASE | re.DOTALL,
    )

    def detect_remember_intent(self, text: str) -> str | None:
        """Detect explicit 'please remember X' requests.

        Handles:
          - "Merk dir: Ich habe einen Kollegen namens Jenö"
          - "Ich habe einen Kollegen namens Jenö, bitte merk dir das"
          - "Meine Katze heißt Minka, bitte speichere das"
          - "Vergiss nicht, meine Katze heißt Minka"
          - "Please remember that my sister's name is Lea"

        Args:
            text: The raw user message.

        Returns:
            The note to save (longer than 3 characters), or None if no
            remember intent was detected.
        """
        text = text.strip()

        # 1) Retroactive: "[fact], bitte merk dir das/den/die/es"
        retro = self._REMEMBER_RETROACTIVE.search(text)
        if retro:
            note = retro.group(1).strip().strip(",").strip()
            if len(note) > 3:
                return note

        # 2) Forward: "Merk dir: [fact]" / "bitte merk dir, dass ..."
        trigger = self._REMEMBER_TRIGGERS.search(text)
        if trigger:
            # Everything after the trigger phrase, minus leading separators
            # and a trailing period / exclamation mark.
            after = text[trigger.end():].strip().lstrip(",: ").strip()
            after = after.rstrip(".!")
            if len(after) > 3:
                return after

        return None

    def process_message(self, role: str, content: str):
        """Extract facts and explicit notes from a user message.

        Messages whose *role* is not ``"user"`` are ignored.
        """
        if role != "user":
            return
        # 1) Regex-based structured fact extraction (name, age, city, ...)
        self.extract_facts(content)
        # 2) Explicit "remember this" intent -> custom note
        note = self.detect_remember_intent(content)
        if note and note not in self._custom:
            self._custom.append(note)
            self._save()
            log.info("Memory (custom note): %s", note)

    def add_fact(self, category: str, value: str):
        """Manually set the fact for *category* (replacing any old value) and save."""
        self._facts[category] = value
        self._save()

    def add_custom(self, note: str):
        """Add a custom memory note and save, unless the same note already exists."""
        if note not in self._custom:
            self._custom.append(note)
            self._save()

    def remove_fact(self, category: str) -> bool:
        """Delete the fact for *category*; return True if it existed."""
        if category in self._facts:
            del self._facts[category]
            self._save()
            return True
        return False

    def remove_custom(self, index: int) -> bool:
        """Delete the note at *index*; return False if the index is out of range."""
        if 0 <= index < len(self._custom):
            self._custom.pop(index)
            self._save()
            return True
        return False

    def build_context(self) -> str:
        """Build a context string for the system prompt.

        Returns:
            A newline-joined listing of all facts and notes, or an empty
            string when nothing is stored.
        """
        if not self._facts and not self._custom:
            return ""

        parts = []
        if self._facts:
            parts.append("Bekannte Informationen ueber den Nutzer:")
            parts.extend(
                f"  - {self._label_for(cat)}: {val}" for cat, val in self._facts.items()
            )

        if self._custom:
            parts.append("Notizen:")
            parts.extend(f"  - {note}" for note in self._custom)

        return "\n".join(parts)

    @property
    def all_facts(self) -> dict:
        """A shallow copy of the stored facts."""
        return dict(self._facts)

    @property
    def all_custom(self) -> list[str]:
        """A copy of the stored custom notes."""
        return list(self._custom)

    @property
    def stats(self) -> dict:
        """Counts of facts and notes plus the timestamp of the last save."""
        return {
            "facts": len(self._facts),
            "custom": len(self._custom),
            "updated": self._updated,
        }
