Added audio generation

2026-02-11 20:58:35 +01:00
parent 149a142c22
commit 11ad746f04
15 changed files with 267 additions and 0 deletions
--- a/firmware/tools/audio/.gitattributes
+++ b/firmware/tools/audio/.gitattributes
@@ -0,0 +1 @@
+# Track every file below samples/ with Git LFS. A trailing-slash pattern
+# ("samples/") matches nothing in an attributes file; "samples/**" is required.
+samples/** filter=lfs diff=lfs merge=lfs -text
--- a/firmware/tools/audio/.gitignore
+++ b/firmware/tools/audio/.gitignore
@@ -0,0 +1,2 @@
+cache
+previews
--- a/firmware/tools/audio/build_audio.py
+++ b/firmware/tools/audio/build_audio.py
@@ -0,0 +1,198 @@
+import os
+import yaml
+import hashlib
+import shutil
+import subprocess
+from pathlib import Path
+from kokoro import KPipeline
+import soundfile as sf
+
+class AudioBuilder:
+    def __init__(self):
+        # 1. Defaults
+        self.config = {
+            'paths': {
+                'cache': Path("./cache"),
+                'output': Path("./lfs_source"),
+                'preview': Path("./previews")
+            },
+            'settings': { 'sample_rate': 16000 }
+        }
+        
+        # 2. Global YAML laden (überschreibt Defaults)
+        self.load_global_config()
+        
+        # Pfade sicherstellen
+        for p in self.config['paths'].values():
+            Path(p).mkdir(parents=True, exist_ok=True)
+            
+        self.pipeline = KPipeline(lang_code='a')
+
+    def load_global_config(self):
+        if Path("global.yaml").exists():
+            with open("global.yaml", "r") as f:
+                g_cfg = yaml.safe_load(f)
+                
+                # Pfade überschreiben und sofort in Path-Objekte wandeln
+                if 'paths' in g_cfg:
+                    for key, value in g_cfg['paths'].items():
+                        self.config['paths'][key] = Path(value)
+                
+                # Settings (wie Sample Rate) überschreiben
+                if 'settings' in g_cfg:
+                    self.config['settings'].update(g_cfg['settings'])
+        
+        # Sicherstellen, dass der Standard-Sample-Pfad existiert, falls nicht in global.yaml
+        if 'samples' not in self.config['paths']:
+            self.config['paths']['samples'] = Path("./samples")
+
+    def get_hash(self, text, voice, filters, sample_rate):
+            """Erzeugt einen MD5-Hash über alle Parameter, die das Audio-Ergebnis beeinflussen."""
+            # Wir kombinieren alle Parameter zu einem String
+            data = f"{text}{voice}{''.join(filters)}{sample_rate}"
+            return hashlib.md5(data.encode()).hexdigest()
+
+    def run_ffmpeg(self, cmd):
+        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
+
+    def process_asset(self, asset, default_voice, global_filters):
+        # 1. Parameter vorbereiten
+        asset_id = asset['id']
+        sr_val = self.config['settings']['sample_rate']
+        sr_str = str(sr_val)
+        filters = asset.get('filters', global_filters)
+        
+        # Weiche: Ist es ein lokales Sample oder KI-Sprache?
+        is_sample = asset.get('type') == 'sample'
+        
+        if is_sample:
+            # Pfad aus global.yaml nutzen
+            source_file = self.config['paths']['samples'] / asset['source']
+            # Hash basiert auf Dateiname + Filter + SR
+            h = self.get_hash(asset['source'], "LOCAL_SAMPLE", filters, sr_str)
+            input_for_ffmpeg = str(source_file)
+        else:
+            # Klassische KI-Sprache
+            voice = asset.get('voice', default_voice)
+            text = asset['text']
+            h = self.get_hash(text, voice, filters, sr_str)
+            # Input wird später das temp_wav sein
+        
+        # Pfade definieren
+        cache_file = self.config['paths']['cache'] / h
+        preview_file = self.config['paths']['preview'] / f"{asset_id}.wav"
+
+        if not cache_file.exists():
+            print(f"  [GEN] {asset_id}...")
+            
+            # 2. Input-Vorbereitung
+            if not is_sample:
+                # KI-Sprache: Temp WAV erzeugen
+                temp_wav = self.config['paths']['cache'] / f"temp_{h}.wav"
+                generator = self.pipeline(text, voice=voice, speed=1.0)
+                for _, _, audio in generator:
+                    sf.write(temp_wav, audio, 24000)
+                    break
+                current_input = str(temp_wav)
+            else:
+                current_input = input_for_ffmpeg
+
+            # 3. Filter-String
+            f_str = ",".join(filters)
+            
+            # 4. RAW-Export für nRF
+            self.run_ffmpeg([
+                'ffmpeg', '-y', '-i', current_input, 
+                '-af', f_str, 
+                '-ar', sr_str, '-ac', '1', 
+                '-f', 's16le', '-acodec', 'pcm_s16le', 
+                str(cache_file)
+            ])
+            
+            # 5. Preview-WAV
+            self.run_ffmpeg([
+                'ffmpeg', '-y', '-i', current_input, 
+                '-af', f_str, 
+                '-ac', '1', 
+                str(preview_file)
+            ])
+            
+            # Aufräumen (nur wenn es ein KI-Temp-File war)
+            if not is_sample and Path(current_input).exists(): 
+                Path(current_input).unlink()
+                
+        return cache_file
+
+    def generate_countdown(self, config):
+        """Erzeugt einen zusammenhängenden 10-Sekunden-Countdown."""
+        c_id = config['id']
+        voice = config['voice']
+        filters = config.get('filters', [])
+        sr_str = str(self.config['settings']['sample_rate'])
+
+        # Eigener Hash für das gesamte Countdown-Objekt
+        h = self.get_hash(f"COUNTDOWN_LOGIC_V2_{c_id}", voice, filters, sr_str)
+        final_cache_file = self.config['paths']['cache'] / f"final_{h}"
+
+        if not final_cache_file.exists():
+            print(f"  [GEN] Spezial-Asset: {c_id} (10 bis 1)")
+            
+            # Die Texte für die Zahlen
+            numbers = [
+                "TEN!", "NINE!", "EIGHT!", "SEVEN!", "SIX!", 
+                "FIVE!", "FOUR!", "THREE!", "TWO!!", "ONE!!!"
+            ]
+            
+            parts = []
+            for i, txt in enumerate(numbers):
+                # Jede Zahl als temporäres Asset durch die Standard-Pipeline jagen
+                # Wir nutzen eine interne ID, um Kollisionen im Preview-Ordner zu vermeiden
+                part_id = f"cnt_tmp_{i}"
+                part_file = self.process_asset(part_id, txt, voice, filters)
+                parts.append(part_file)
+
+            # Binäres Zusammenfügen (Da s16le keinen Header hat)
+            with open(final_cache_file, 'wb') as outfile:
+                for p_file in parts:
+                    with open(p_file, 'rb') as infile:
+                        outfile.write(infile.read())
+            
+            print(f"  [DONE] Countdown-Kette generiert: {final_cache_file.name}")
+            
+        return final_cache_file
+
+    def build_target(self, target_name):
+        print(f"🚀 Baue Assets für Target: {target_name.upper()}")
+        out_dir = self.config['paths']['output']
+        out_dir.mkdir(parents=True, exist_ok=True)
+
+        for cfg_file in Path(".").glob("*.yaml"):
+            if cfg_file.name in ["global.yaml", "countdown.yaml"]:
+                continue
+            
+            with open(cfg_file, "r") as f:
+                config = yaml.safe_load(f)
+
+            # Falls das YAML eine Liste ist (z.B. [- id: ...])
+            if isinstance(config, list):
+                assets = config
+                voice = None
+                global_filters = []
+            else:
+                # Falls es ein Dictionary ist (z.B. voice: ... assets: ...)
+                assets = config.get('assets', [])
+                voice = config.get('voice')
+                global_filters = config.get('filters', [])
+            
+            for asset in assets:
+                if target_name in asset.get('targets', []):
+                    source_cache = self.process_asset(asset, voice, global_filters)
+                    if source_cache:
+                        dest_file = out_dir / asset['id']
+                        shutil.copy(source_cache, dest_file)
+
+
+if __name__ == "__main__":
+    import sys
+    target = sys.argv[1] if len(sys.argv) > 1 else "vest"
+    AudioBuilder().build_target(target)
--- a/firmware/tools/audio/countdown.yaml
+++ b/firmware/tools/audio/countdown.yaml
@@ -0,0 +1,14 @@
+id: "countdown"
+voice: "am_michael"
+targets: ["vest", "base"]
+
+# The filters below are applied to every individual number
+filters:
+  - "lowshelf=f=100:g=20:enable='lt(t,0.1)'"
+  - "asetrate=24000*0.85"
+  - "atempo=1.17"
+  - "acontrast=80"
+  - "aecho=0.8:0.88:60:0.4"
+  - "atrim=end=1"
+  - "apad=whole_dur=1"
+  - "loudnorm=I=-12:TP=-1.0"
--- a/firmware/tools/audio/global.yaml
+++ b/firmware/tools/audio/global.yaml
@@ -0,0 +1,7 @@
+paths:
+  cache: "./cache"
+  output: "../littlefs_generator/source_folder/a"
+  preview: "./previews"
+  samples: "./samples"
+settings:
+  sample_rate: 16000
--- a/firmware/tools/audio/lfs_source/countdown
+++ b/firmware/tools/audio/lfs_source/countdown
--- a/firmware/tools/audio/lfs_source/dead
+++ b/firmware/tools/audio/lfs_source/dead
--- a/firmware/tools/audio/lfs_source/g1
+++ b/firmware/tools/audio/lfs_source/g1
--- a/firmware/tools/audio/lfs_source/game_start
+++ b/firmware/tools/audio/lfs_source/game_start
--- a/firmware/tools/audio/lfs_source/s1
+++ b/firmware/tools/audio/lfs_source/s1
--- a/firmware/tools/audio/requirements.txt
+++ b/firmware/tools/audio/requirements.txt
@@ -0,0 +1,5 @@
+kokoro>=0.1.0        # The TTS pipeline
+soundfile            # For writing the WAV files (sf.write)
+numpy                # Foundation for the audio data arrays
+torch                # Backend for Kokoro (AI model)
+PyYAML               # For the voice_*.yaml configuration files
--- a/firmware/tools/audio/samples/horn.ogg
+++ b/firmware/tools/audio/samples/horn.ogg
--- a/firmware/tools/audio/sfx.yaml
+++ b/firmware/tools/audio/sfx.yaml
@@ -0,0 +1,11 @@
+- id: "game_start"
+  type: "sample"            # Distinguishes a file-based sample from TTS
+  source: "horn.ogg" # The file inside the samples/ directory
+  targets: ["vest", "base"]
+  filters:
+    - "atrim=start=0.15"
+    - "highpass=f=100"
+    - "lowpass=f=6000"
+    - "acompressor=threshold=-8dB:ratio=20:attack=1:release=30" # Maximum "compressed air" punch
+    - "amix=inputs=1:weights=1.5" # Saturation
+    - "loudnorm=I=-12:TP=-1.0"
--- a/firmware/tools/audio/voice_game.yaml
+++ b/firmware/tools/audio/voice_game.yaml
@@ -0,0 +1,17 @@
+voice: "am_michael"
+
+filters:
+  - "asetrate=24000*0.85"
+  - "atempo=1.17"
+  - "acompressor=threshold=-20dB:ratio=4:attack=5:release=50"
+  - "highpass=f=150"
+  - "lowpass=f=4000"
+  - "loudnorm=I=-14:TP=-1.5"
+
+assets:
+  - id: "g1"
+    text: "Welcome to the Arena!"
+    targets: ["vest", "weapon"]
+  - id: "dead"
+    text: "YOU ARE DEAD!"
+    targets: ["vest"]
--- a/firmware/tools/audio/voice_system.yaml
+++ b/firmware/tools/audio/voice_system.yaml
@@ -0,0 +1,12 @@
+voice: "af_bella"
+
+filters:
+  - "highpass=f=200"
+  - "lowpass=f=4500"
+  - "compand=0.3|0.3:1|1:-90/-60|-60/-40|-40/-30|-20/-20:6:0:-90:0.2"
+  - "loudnorm=I=-14:TP=-1.5"
+
+assets:
+  - id: "s1"
+    text: "Network parameters deployed. Connecting to the game leader..."
+    targets: ["vest", "weapon"]
				`@@ -0,0 +1 @@`
				`samples/ filter=lfs diff=lfs merge=lfs -text`