PixinWavICASSP2022

`stegobox.codec.PixinWavICASSP2022`

Bases: BaseCodec

PixInWav targets the multimodal case of hiding images in audio.

To this end, we propose a novel residual architecture operating on top of short-time discrete cosine transform (STDCT) audio spectrograms. Among our results, we find that the residual steganography setup we propose allows an encoding of the hidden image that is independent from the host audio without compromising quality.

This repository includes a Python implementation of StegoUNet, a deep neural network modelling an audio steganographic function.

Created by: QiuYu
Created time: 2022/11/08

This is a PyTorch implementation of image steganography via deep learning, as published in the paper "PixInWav: Residual Steganography for Hiding Pixels in Audio".

Originally implemented in margaritageleta/PixInWav.

Source code in stegobox/codec/pixinwav_icassp2022/pixinwav_icassp2022.py

class PixinWavICASSP2022(BaseCodec):
    """PixInWav targets the multimodal case of hiding images in audio.

    To this end, we propose a novel residual architecture operating on top of short-time
    discrete cosine transform (STDCT) audio spectrograms. Among our results, we find
    that the residual steganography setup we propose allows an encoding of the hidden
    image that is independent from the host audio without compromising quality.

    This repository includes a python implementation of StegoUNet, a deep neural network
    modelling an audio steganographic function.

    * Created by: QiuYu
    * Created time: 2022/11/08

    This is a PyTorch implementation of image steganography via deep learning, which is
    released in paper - [Pixinwav: Residual Steganography for Hiding Pixels in
    Audio](https://ieeexplore.ieee.org/document/9746191/).

    Originally implemented in
    [margaritageleta/PixInWav](https://github.com/margaritageleta/PixInWav).

    Args:
        weights: Path to the checkpoint file. The loaded object must be indexable
            with the key `"state_dict"`.
    """

    def __init__(self, weights: str = "ckpt/pixinwav/checkpoint.pt") -> None:
        super().__init__()
        self.model = StegoUNet(
            transform="cosine",
            add_noise=False,
            noise_kind=None,
            noise_amplitude=None,
        )

        # NOTE(review): torch.load unpickles the file, so only trusted checkpoints
        # should be passed here.
        self.checkpoint = torch.load(weights, map_location="cpu")
        # The state dict is loaded *after* wrapping in DataParallel, so its keys are
        # matched against the wrapped module's parameter names.
        self.model = nn.DataParallel(self.model)  # type: ignore
        # Non-strict loading: mismatching keys are tolerated rather than raising.
        self.model.load_state_dict(self.checkpoint["state_dict"], strict=False)
        # Replaced by the second output of the model on every call to `encode`.
        self.payload = ""
        print("Checkpoint loaded ++")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def imagetransform(
        self, image: Image.Image, colorspace="RGB", n=256, proportion=2**6
    ):
        """Crop, resize and rescale an image for use as payload.

        The crop box spans from `side / proportion` to
        `(proportion - 1) * side / proportion` on both axes, where `side` is the
        shorter edge of the image.

        Args:
            image: Image to convert.
            colorspace: Mode passed to `Image.convert`.
            n: Edge length in pixels of the square output image.
            proportion: Denominator that determines the crop box (see above).

        Returns:
            A float NumPy array of the resized image with values divided by 255.
        """
        converted = image.convert(colorspace)
        width, height = converted.size  # type: ignore
        side = min(width, height)
        left = top = side / proportion
        right = bottom = (proportion - 1) * side / proportion
        cropped = converted.crop((left, top, right, bottom))  # type: ignore
        # Image.LANCZOS is the same filter as the Image.ANTIALIAS alias, which was
        # removed in Pillow 10.
        resized = cropped.resize((n, n), Image.LANCZOS)  # type: ignore
        return np.array(resized).astype("float") / 255.0

    def audiotransform(self, transform, sound):
        """Cut the audio to a fixed number of samples and transform it.

        Only `sound[0]` is used. Audio shorter than the limit is written at the start
        of a buffer pre-filled with low-amplitude Gaussian noise; audio longer than
        the limit is cut to a window that starts at a random offset, so the result is
        not deterministic for such inputs.

        Args:
            transform: Either `"cosine"` or `"fourier"`.
            sound: Audio tensor; its first entry along dimension 0 is taken as the
                samples.

        Returns:
            The result of `sdct_torch` for `"cosine"`, or a `(magnitude, phase)`
            tuple from `STFT.transform` for `"fourier"`.

        Raises:
            ValueError: If `transform` is neither `"cosine"` nor `"fourier"`.
        """
        # Reject unsupported transforms before doing any work.
        if transform not in ("cosine", "fourier"):
            raise ValueError("Transform not implemented")

        is_cosine = transform == "cosine"
        limit = 67522  # 2 ** 16 + 2 ** 11 - 2 ** 6 + 2
        frame_length = 2**12 if is_cosine else 2**11 - 1
        frame_step = 2**6 - 2 if is_cosine else 132

        # Get the samples dimension
        samples = sound[0]
        # Noise-filled buffer that receives the samples
        buffer = torch.zeros([limit]).normal_(mean=0, std=0.005)
        # Cut the audio on limit
        if samples.numel() < limit:
            buffer[: samples.numel()] = samples[:]
        else:
            start = random.randint(0, len(samples) - limit)
            buffer[:] = samples[start : start + limit]
        buffer = buffer.type(torch.float32)

        if is_cosine:
            return sdct_torch(
                buffer,
                frame_length=frame_length,
                frame_step=frame_step,
            )

        stft = STFT(
            filter_length=frame_length,
            hop_length=frame_step,
            win_length=frame_length,
            window="hann",
        )
        magnitude, phase = stft.transform(buffer.unsqueeze(0))  # type: ignore
        return magnitude, phase

    def encode(self, carrier: torch.Tensor, payload: Image.Image) -> torch.Tensor:
        """Encoder requires carrier audio to be WAV and payload to be an image.

        The second output of the model is stored in `self.payload`.

        Args:
            carrier: Carrier audio in format WAV.
                     Read with `stegobox.io.torchaudiofile.read()`.
            payload: Payload (secret message) to be encoded. Payload is an image.
                     Read with `stegobox.io.txt.read_bytes()`
                     (NOTE(review): this does not look like an image reader; confirm
                     the intended loader).

        Returns:
            container_wav: Can be formed to container audio through
                           'io.torchaudiofile.write()' function
        """
        carrier2 = self.audiotransform("cosine", carrier)
        # Tensor.to() returns a new tensor, so the result has to be kept.
        carrier2 = carrier2.to(self.device)  # type: ignore
        payload2 = np.asarray(self.imagetransform(payload, n=256)).astype("float64")
        payload3 = torch.from_numpy(payload2).unsqueeze(0)
        # Channels-last -> channels-first; cast to float32 on the selected device
        # without requiring CUDA to be available.
        payload4 = payload3.permute(0, 3, 1, 2).to(
            device=self.device, dtype=torch.float32
        )
        container, self.payload = self.model(payload4, carrier2)
        container_wav = isdct_torch(
            container.squeeze(0).squeeze(0),
            frame_length=4096,
            frame_step=62,
            window=torch.hamming_window,
        )
        return container_wav

    def decode(self, carrier: torch.Tensor) -> torch.Tensor:
        """Decode the secret payload from the carrier audio.

        Args:
            carrier: Carrier audio in format WAV.
                     Read with `stegobox.io.torchaudiofile.read()`.

        Returns:
            reveal: The decoded payload image (secret message).
                Can be formed to image through 'io.image.write_tensor()' function
        """
        spectrogram = self.audiotransform("cosine", carrier)
        spectrogram = spectrogram.to(self.device)  # type: ignore
        # Add two leading singleton dimensions before calling the reveal network.
        spectrogram = spectrogram.unsqueeze(0).unsqueeze(0)
        reveal = self.model.module.RN.forward(spectrogram)  # type: ignore
        return reveal

`encode(carrier, payload)`

Encoder requires carrier audio to be WAV and payload to be an image.

Parameters:

Name	Type	Description	Default
`carrier`	`Tensor`	Carrier audio in format WAV. Read with `stegobox.io.torchaudiofile.read()`.	required
`payload`	`Image`	Payload (secret message) to be encoded. Payload is an image. Read with `stegobox.io.txt.read_bytes()`	required

Returns:

Name	Type	Description
`container_wav`	`Tensor`	Can be formed to container audio through 'io.torchaudiofile.write()' function

Source code in stegobox/codec/pixinwav_icassp2022/pixinwav_icassp2022.py

def encode(self, carrier: torch.Tensor, payload: Image.Image) -> torch.Tensor:
    """Encoder requires carrier audio to be WAV and payload to be an image.

    The second output of the model is stored in `self.payload`.

    Args:
        carrier: Carrier audio in format WAV.
                 Read with `stegobox.io.torchaudiofile.read()`.
        payload: Payload (secret message) to be encoded. Payload is an image.
                 Read with `stegobox.io.txt.read_bytes()`
                 (NOTE(review): this does not look like an image reader; confirm
                 the intended loader).

    Returns:
        container_wav: Can be formed to container audio through
                       'io.torchaudiofile.write()' function
    """
    carrier2 = self.audiotransform("cosine", carrier)
    # Tensor.to() returns a new tensor, so the result has to be kept.
    carrier2 = carrier2.to(self.device)  # type: ignore
    payload2 = np.asarray(self.imagetransform(payload, n=256)).astype("float64")
    payload3 = torch.from_numpy(payload2).unsqueeze(0)
    # Channels-last -> channels-first; cast to float32 on the selected device
    # without requiring CUDA to be available.
    payload4 = payload3.permute(0, 3, 1, 2).to(
        device=self.device, dtype=torch.float32
    )
    container, self.payload = self.model(payload4, carrier2)
    container_wav = isdct_torch(
        container.squeeze(0).squeeze(0),
        frame_length=4096,
        frame_step=62,
        window=torch.hamming_window,
    )
    return container_wav

`decode(carrier)`

Decode the secret payload from the carrier audio

Parameters:

Name	Type	Description	Default
`carrier`	`Tensor`	Carrier audio in format WAV. Read with `stegobox.io.torchaudiofile.read()`.	required

Returns:

Name	Type	Description
`reveal`	`Tensor`	The decoded payload image (secret message). Can be formed to image through 'io.image.write_tensor()' function

Source code in stegobox/codec/pixinwav_icassp2022/pixinwav_icassp2022.py

def decode(self, carrier: torch.Tensor) -> torch.Tensor:
    """Decode the secret payload from the carrier audio.

    Args:
        carrier: Carrier audio in format WAV.
                 Read with `stegobox.io.torchaudiofile.read()`.

    Returns:
        reveal: The decoded payload image (secret message).
            Can be formed to image through 'io.image.write_tensor()' function
    """
    spectrogram = self.audiotransform("cosine", carrier)
    spectrogram = spectrogram.to(self.device)  # type: ignore
    # Add two leading singleton dimensions before calling the reveal network.
    spectrogram = spectrogram.unsqueeze(0).unsqueeze(0)
    reveal = self.model.module.RN.forward(spectrogram)  # type: ignore
    return reveal