#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import cv2
import re
import time
import atexit
import threading
import shutil
import subprocess
import tempfile
import os
import queue
import json
# config is expected to define the constants used below (CAMERA_INDEX, WIDTH,
# HEIGHT, AUDIO_BACKEND, ALSA_CARD, ALSA_MIXER, PIPER_MODEL, VOSK_MODEL_PATH,
# VOSK_SAMPLE_RATE, VOLUME_STEP, FPS_LIMIT, detection defaults, etc.).
from config import *
from io import BytesIO
from flask import Flask, Response, render_template, send_file, abort, request, jsonify

###############################################################################
# Camera / Flask / Audio
###############################################################################

def _cmd_ok(cmd, **kwargs):
    # Run a command; return the CompletedProcess on success, None on any failure.
    try:
        return subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              check=True, text=True, **kwargs)
    except Exception:
        return None

def audio_backend_detect():
    if AUDIO_BACKEND in ('pactl', 'alsa'):
        return AUDIO_BACKEND
    if shutil.which('pactl'):
        info = _cmd_ok(['pactl', 'info'])
        if info and 'Server Name' in info.stdout:
            return 'pactl'
    if shutil.which('amixer'):
        return 'alsa'
    return 'none'

def get_volume():
    backend = audio_backend_detect()
    if backend == 'pactl':
        info = _cmd_ok(['pactl', 'get-sink-volume', '@DEFAULT_SINK@'])
        mute = _cmd_ok(['pactl', 'get-sink-mute', '@DEFAULT_SINK@'])
        if info and mute:
            line = info.stdout.strip().splitlines()[-1]
            perc = 0
            # Split '%' into its own token, then take the first plausible
            # percentage (raw channel values like 26214 are skipped).
            for tok in line.replace('%', ' % ').split():
                t = tok.strip().rstrip('%')
                if t.isdigit():
                    val = int(t)
                    if 0 <= val <= 200:
                        perc = val
                        break
            muted = 'yes' in mute.stdout.lower()
            return perc, muted, 'pactl'
    elif backend == 'alsa':
        out = _cmd_ok(['amixer', '-c', ALSA_CARD, 'get', ALSA_MIXER])
        if out:
            txt = out.stdout
            m = re.search(r'\[(\d{1,3})%\]', txt)
            perc = int(m.group(1)) if m else 0
            muted = '[off]' in txt
            return perc, muted, 'alsa'
    return 0, False, 'none'

def set_volume(level):
    try:
        level = int(level)
    except Exception:
        return False
    level = max(0, min(level, 100))
    backend = audio_backend_detect()
    if backend == 'pactl':
        _cmd_ok(['pactl', 'set-sink-volume', '@DEFAULT_SINK@', f'{level}%'])
        return True
    elif backend == 'alsa':
        _cmd_ok(['amixer', '-c', ALSA_CARD, 'set', ALSA_MIXER, f'{level}%'])
        return True
    return False

def change_volume(delta_percent):
    vol, muted, _ = get_volume()
    # Clamp to 100 to match set_volume(), which caps at 100 anyway.
    target = max(0, min(vol + int(delta_percent), 100))
    return set_volume(target)

def toggle_mute(force_state=None):
    backend = audio_backend_detect()
    if backend == 'pactl':
        if force_state is None:
            _cmd_ok(['pactl', 'set-sink-mute', '@DEFAULT_SINK@', 'toggle'])
        else:
            _cmd_ok(['pactl', 'set-sink-mute', '@DEFAULT_SINK@', '1' if force_state else '0'])
        return True
    elif backend == 'alsa':
        if force_state is None:
            _cmd_ok(['amixer', '-c', ALSA_CARD, 'set', ALSA_MIXER, 'toggle'])
        else:
            _cmd_ok(['amixer', '-c', ALSA_CARD, 'set', ALSA_MIXER, 'mute' if force_state else 'unmute'])
        return True
    return False

###############################################################################
# Object detection (YOLOv8 by default, MobileNet-SSD fallback)
###############################################################################

class ObjectDetector:
    def __init__(self, backend="yolo", conf=0.3, img_size=640):
        # Only the YOLO path is implemented; "backend" is kept for a future
        # MobileNet-SSD fallback and is currently informational only.
        self.backend = backend
        self.conf = conf
        self.img_size = img_size
        self.ready = False
        try:
            from ultralytics import YOLO
            self.model = YOLO("yolov8n.pt")
            self.names = self.model.names
            self.ready = True
        except Exception as e:
            print("[DETECT] YOLO init error:", e)

    def annotate(self, frame_bgr):
        if not self.ready:
            return frame_bgr
        rlist = self.model.predict(source=frame_bgr, imgsz=self.img_size,
                                   conf=self.conf, verbose=False)
        if rlist:
            r = rlist[0]
            names = self.names if hasattr(self, "names") else {}
            for b in r.boxes:
                x1, y1, x2, y2 = map(int, b.xyxy[0].tolist())
                score = float(b.conf[0])
                cls = int(b.cls[0])
                label = f"{names.get(cls, str(cls))}"
                # label = f"{names.get(cls, str(cls))} {score:.2f}"
                cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), (255, 0, 0), 1)
                (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
                cv2.rectangle(frame_bgr, (x1, max(0, y1 - th - 6)), (x1 + tw + 8, y1), (255, 0, 0), -1)
                cv2.putText(frame_bgr, label, (x1 + 4, y1 - 6), cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, (0, 0, 0), 1, cv2.LINE_AA)
        return frame_bgr
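# A minimal standalone check of the detector (an illustrative sketch, not used
# by the app itself; "test.jpg" is a hypothetical local image -- call this
# manually from a REPL if you want to verify the YOLO weights load and run):
def _detector_smoke_test(image_path="test.jpg"):
    det = ObjectDetector(backend="yolo", conf=0.3)
    img = cv2.imread(image_path)
    if not det.ready or img is None:
        return False
    # Write the annotated copy next to the current working directory.
    cv2.imwrite("annotated.jpg", det.annotate(img))
    return True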
###############################################################################
# Camera (capture thread + inference thread)
###############################################################################

class Camera:
    def __init__(self, index=0, backend=None, width=None, height=None):
        self.cap = cv2.VideoCapture(index, backend or cv2.CAP_ANY)
        if not self.cap.isOpened():
            raise RuntimeError(f"Cannot open camera index={index}")
        if width and height:
            self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
            self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
        self.detect_enabled = USE_DETECTION_DEFAULT
        self.show_fps = SHOW_FPS_DEFAULT
        self.detector = ObjectDetector(DETECTOR_BACKEND, CONF_THRES, IMG_SIZE)
        self.lock = threading.Lock()
        self.frame = None
        self.annotated = None
        self.cam_fps = 0.0
        self.inf_fps = 0.0
        self._last_cam_t = 0.0
        self.running = True
        self.reader_t = threading.Thread(target=self._reader, daemon=True)
        self.reader_t.start()
        self.infer_interval = 1.0 / max(1, INFER_FPS)
        self.infer_t = threading.Thread(target=self._infer_loop, daemon=True)
        self.infer_t.start()

    def _ema(self, old, value, alpha=0.2):
        # Exponential moving average, seeded with the first sample.
        return value if old == 0.0 else (alpha * value + (1 - alpha) * old)

    def _draw_fps_overlay(self, img):
        if not self.show_fps:
            return
        text = f"CAM {self.cam_fps:.1f} FPS"
        if self.detect_enabled and self.detector and self.detector.ready:
            text += f" | INF {self.inf_fps:.1f} FPS"
        # Dark outline first, then white text, for readability on any frame.
        cv2.putText(img, text, (10, 22), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 3, cv2.LINE_AA)
        cv2.putText(img, text, (10, 22), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1, cv2.LINE_AA)

    def _reader(self):
        while self.running:
            ret, frame = self.cap.read()
            if not ret:
                time.sleep(0.01)
                continue
            now = time.time()
            if self._last_cam_t != 0.0:
                inst = 1.0 / max(1e-6, now - self._last_cam_t)
                self.cam_fps = self._ema(self.cam_fps, inst)
            self._last_cam_t = now
            with self.lock:
                self.frame = frame

    def _infer_loop(self):
        last = 0.0
        while self.running:
            now = time.time()
            if now - last < self.infer_interval:
                time.sleep(0.003)
                continue
            last = now
            with self.lock:
                frame = None if self.frame is None else self.frame.copy()
                detect_on = self.detect_enabled
            if frame is None:
                continue
            t0 = time.time()
            out = self.detector.annotate(frame) if (detect_on and self.detector and self.detector.ready) else frame
            self._draw_fps_overlay(out)
            ok, buf = cv2.imencode(".jpg", out, [cv2.IMWRITE_JPEG_QUALITY, 85])
            if ok:
                with self.lock:
                    self.annotated = buf.tobytes()
            dt = time.time() - t0
            if dt > 0:
                self.inf_fps = self._ema(self.inf_fps, 1.0 / dt)

    def get_jpeg(self, quality=85):
        # Note: a cached annotated JPEG is returned as-is (encoded at quality
        # 85 by the inference thread); "quality" only applies to raw frames.
        with self.lock:
            if self.annotated is not None:
                return self.annotated
            if self.frame is None:
                return None
            raw = self.frame.copy()
        if self.show_fps:
            self._draw_fps_overlay(raw)
        ok, buf = cv2.imencode(".jpg", raw, [cv2.IMWRITE_JPEG_QUALITY, quality])
        return buf.tobytes() if ok else None

    def set_detect_enabled(self, state: bool):
        with self.lock:
            self.detect_enabled = bool(state)

    def set_show_fps(self, state: bool):
        with self.lock:
            self.show_fps = bool(state)

    def get_stats(self):
        with self.lock:
            return {
                "detect_enabled": self.detect_enabled,
                "show_fps": self.show_fps,
                "backend": DETECTOR_BACKEND if (self.detector and self.detector.ready) else "none",
                "cam_fps": round(self.cam_fps, 2),
                "infer_fps": round(self.inf_fps, 2) if (self.detector and self.detector.ready and self.detect_enabled) else 0.0,
            }

    def release(self):
        self.running = False
        try:
            self.infer_t.join(timeout=1)
            self.reader_t.join(timeout=1)
        except Exception:
            pass
        self.cap.release()
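# Design note: capture and inference are deliberately decoupled. The reader
# thread only ever keeps the latest frame (older frames are dropped), while
# the inference thread produces at most INFER_FPS annotated JPEGs per second,
# so a slow model never stalls the camera or the MJPEG stream.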
camera = Camera(index=CAMERA_INDEX, backend=CAPTURE_BACKEND, width=WIDTH, height=HEIGHT)
atexit.register(camera.release)

###############################################################################
# TTS (Piper → espeak → spd-say)
###############################################################################

def tts_backend_detect():
    if shutil.which("piper") and os.path.exists(PIPER_MODEL):
        return "piper"
    if shutil.which("espeak"):
        return "espeak"
    if shutil.which("spd-say"):
        return "spd-say"
    return "none"

def speak_text(text):
    # Python strings have no .trim(); .strip() is the equivalent.
    text = (text or "").strip()
    if not text:
        return False, "none", "empty text"
    if len(text) > 1000:
        text = text[:1000]
    ab = audio_backend_detect()
    ttsb = tts_backend_detect()
    try:
        if ttsb == "piper":
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                wav_path = tmp.name
            p = _cmd_ok(["piper", "--model", PIPER_MODEL, "--output_file", wav_path], input=text)
            if not p:
                return False, ttsb, "piper synthesis failed"
            if ab == "pactl" and shutil.which("paplay"):
                play = _cmd_ok(["paplay", wav_path])
            elif shutil.which("aplay"):
                play = _cmd_ok(["aplay", "-q", wav_path])
            else:
                return False, ttsb, "no player found (paplay/aplay)"
            try:
                os.unlink(wav_path)
            except Exception:
                pass
            return (play is not None), ttsb, None if play else "playback failed"
        elif ttsb == "espeak":
            args = ["espeak", "-v", "fr", text]
            if ab == "pactl" and shutil.which("padsp"):
                args = ["padsp"] + args
            ok = _cmd_ok(args) is not None
            return ok, ttsb, None if ok else "espeak failed"
        elif ttsb == "spd-say":
            ok = _cmd_ok(["spd-say", "-l", "fr", text]) is not None
            return ok, ttsb, None if ok else "spd-say failed"
        else:
            return False, "none", "no TTS backend available"
    except Exception as e:
        return False, ttsb, str(e)

###############################################################################
# STT (Vosk: microphone → text)
###############################################################################

# STT state; the Vosk model path comes from config (VOSK_MODEL_PATH).
stt_lock = threading.Lock()
stt_listening = False
stt_thread = None
stt_partial = ""
stt_last_final = ""
stt_log = []
stt_err = None

def _stt_worker():
    global stt_partial, stt_last_final, stt_listening, stt_err
    try:
        from vosk import Model, KaldiRecognizer
        import sounddevice as sd
    except Exception as e:
        stt_err = f"Import error: {e}"
        with stt_lock:
            stt_listening = False
        return
    if not os.path.exists(VOSK_MODEL_PATH):
        stt_err = f"Vosk model not found: {VOSK_MODEL_PATH}"
        with stt_lock:
            stt_listening = False
        return
    try:
        model = Model(VOSK_MODEL_PATH)
        rec = KaldiRecognizer(model, VOSK_SAMPLE_RATE)
    except Exception as e:
        stt_err = f"Vosk init error: {e}"
        with stt_lock:
            stt_listening = False
        return
    stt_err = None
    q = queue.Queue()

    def audio_cb(indata, frames, time_info, status):
        # Status flags (overruns, etc.) are deliberately ignored here.
        q.put(bytes(indata))

    try:
        with sd.RawInputStream(samplerate=VOSK_SAMPLE_RATE, blocksize=8000,
                               dtype='int16', channels=1, callback=audio_cb):
            while True:
                with stt_lock:
                    if not stt_listening:
                        break
                data = q.get()
                if rec.AcceptWaveform(data):
                    res = json.loads(rec.Result() or "{}")
                    txt = (res.get("text") or "").strip()
                    if txt:
                        stt_last_final = txt
                        stt_log.append(txt)
                        if len(stt_log) > 20:
                            stt_log.pop(0)
                        stt_partial = ""
                else:
                    pres = json.loads(rec.PartialResult() or "{}")
                    stt_partial = pres.get("partial", "")
    except Exception as e:
        stt_err = f"Stream error: {e}"
    finally:
        with stt_lock:
            stt_listening = False
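# For reference, the recognizer returns JSON strings shaped roughly like
# {"text": "bonjour tout le monde"} for final results and
# {"partial": "bonjour"} for intermediate hypotheses, which is what the
# parsing in _stt_worker above relies on.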
def stt_start():
    global stt_listening, stt_thread, stt_err
    with stt_lock:
        if stt_listening:
            return True, None
        stt_listening = True
        stt_err = None
    th = threading.Thread(target=_stt_worker, daemon=True)
    th.start()
    stt_thread = th
    return True, None

def stt_stop():
    global stt_listening, stt_thread
    with stt_lock:
        stt_listening = False
    if stt_thread:
        stt_thread.join(timeout=1.5)
        stt_thread = None
    return True

def stt_status():
    with stt_lock:
        return {
            "listening": stt_listening,
            "partial": stt_partial,
            "last_final": stt_last_final,
            "error": stt_err,
            "model": VOSK_MODEL_PATH if os.path.exists(VOSK_MODEL_PATH) else "(missing)",
        }

###############################################################################
# Flask
###############################################################################

app = Flask(__name__)

@app.route("/")
def index():
    vol, muted, backend = get_volume()
    stats = camera.get_stats()
    return render_template(
        "index.html",
        initial_volume=vol,
        initial_muted=muted,
        audio_backend=backend,
        alsa_card=ALSA_CARD,
        alsa_mixer=ALSA_MIXER,
        detect_enabled=stats["detect_enabled"],
        show_fps=stats["show_fps"],
        detector_backend=stats["backend"],
        tts_backend=tts_backend_detect(),
        piper_model=os.path.basename(PIPER_MODEL) if os.path.exists(PIPER_MODEL) else "(none)",
        stt_model=os.path.basename(VOSK_MODEL_PATH) if os.path.exists(VOSK_MODEL_PATH) else "(missing)",
    )

# --------- Video stream
def mjpeg_generator():
    frame_interval = (1.0 / FPS_LIMIT) if FPS_LIMIT and FPS_LIMIT > 0 else 0.0
    last_time = 0.0
    while True:
        if frame_interval:
            now = time.time()
            delta = now - last_time
            if delta < frame_interval:
                time.sleep(frame_interval - delta)
            last_time = time.time()
        jpg = camera.get_jpeg()
        if jpg is None:
            time.sleep(0.02)
            continue
        yield (b"--frame\r\n"
               b"Content-Type: image/jpeg\r\n"
               b"Content-Length: " + str(len(jpg)).encode() + b"\r\n\r\n" + jpg + b"\r\n")

@app.route("/video_feed")
def video_feed():
    return Response(mjpeg_generator(), mimetype="multipart/x-mixed-replace; boundary=frame")

@app.route("/snapshot")
def snapshot():
    jpg = camera.get_jpeg(quality=95)
    if jpg is None:
        abort(503, description="No frame available")
    return send_file(BytesIO(jpg), mimetype="image/jpeg", as_attachment=False, download_name="snapshot.jpg")

# --------- Volume
@app.get("/volume")
def api_get_volume():
    vol, muted, backend = get_volume()
    return jsonify({"volume": vol, "muted": muted, "backend": backend})

@app.post("/volume/set")
def api_set_volume():
    level = request.args.get("level")
    if level is None:
        return jsonify({"ok": False, "error": "missing level"}), 400
    ok = set_volume(level)
    vol, muted, backend = get_volume()
    return jsonify({"ok": ok, "volume": vol, "muted": muted, "backend": backend})

@app.post("/volume/up")
def api_volume_up():
    ok = change_volume(+VOLUME_STEP)
    vol, muted, backend = get_volume()
    return jsonify({"ok": ok, "volume": vol, "muted": muted, "backend": backend})

@app.post("/volume/down")
def api_volume_down():
    ok = change_volume(-VOLUME_STEP)
    vol, muted, backend = get_volume()
    return jsonify({"ok": ok, "volume": vol, "muted": muted, "backend": backend})

@app.post("/volume/mute")
def api_volume_mute():
    state = request.args.get("state")
    force = None
    if state in ("on", "true", "1"):
        force = True
    elif state in ("off", "false", "0"):
        force = False
    ok = toggle_mute(force)
    vol, muted, backend = get_volume()
    return jsonify({"ok": ok, "volume": vol, "muted": muted, "backend": backend})
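# Example calls for the volume API (a sketch, assuming the default host/port
# set in app.run() at the bottom of this file):
#   curl http://localhost:5000/volume
#   curl -X POST "http://localhost:5000/volume/set?level=40"
#   curl -X POST http://localhost:5000/volume/up
#   curl -X POST "http://localhost:5000/volume/mute?state=on"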
# --------- Detection / FPS
@app.get("/stats")
def api_stats():
    return jsonify(camera.get_stats())

@app.post("/detect/toggle")
def api_detect_toggle():
    st = camera.get_stats()
    camera.set_detect_enabled(not st["detect_enabled"])
    return jsonify({"ok": True, **camera.get_stats()})

@app.post("/fps/show")
def api_fps_show():
    state = request.args.get("state")
    if state is None:
        return jsonify({"ok": False, "error": "missing state (true/false)"}), 400
    val = state.lower() in ("1", "true", "on", "yes")
    camera.set_show_fps(val)
    return jsonify({"ok": True, **camera.get_stats()})

# --------- TTS
@app.post("/tts/say")
def api_tts_say():
    data = request.get_json(silent=True) or {}
    text = data.get("text") or request.form.get("text") or ""
    ok, backend, err = speak_text(text)
    return jsonify({"ok": ok, "backend": backend, "error": err})

# --------- STT (Vosk)
@app.get("/stt/status")
def api_stt_status():
    return jsonify(stt_status())

@app.post("/stt/start")
def api_stt_start():
    ok, err = stt_start()
    st = stt_status()
    st["ok"] = ok
    if err:
        st["error"] = err
    return jsonify(st)

@app.post("/stt/stop")
def api_stt_stop():
    ok = stt_stop()
    st = stt_status()
    st["ok"] = ok
    return jsonify(st)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, threaded=True)
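# Example calls for the remaining endpoints (a sketch, same host/port
# assumption as above):
#   curl http://localhost:5000/stats
#   curl -X POST http://localhost:5000/detect/toggle
#   curl -X POST "http://localhost:5000/fps/show?state=true"
#   curl -X POST http://localhost:5000/tts/say \
#        -H "Content-Type: application/json" -d '{"text": "bonjour"}'
#   curl -X POST http://localhost:5000/stt/start
#   curl http://localhost:5000/stt/status
#   curl -X POST http://localhost:5000/stt/stop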