/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "util/bitscan.h"

/* Emit a copy of the given memory load/store intrinsic with its offset
 * source adjusted by the given byte offset and with the requested component
 * count, bit size, and alignment.  For stores, store_src replaces the value
 * source; for loads, the new SSA destination is returned.
 */
static nir_ssa_def *
dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                  nir_ssa_def *store_src, int offset,
                  unsigned num_components, unsigned bit_size,
                  unsigned align)
{
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];

   nir_intrinsic_instr *dup =
      nir_intrinsic_instr_create(b->shader, intrin->intrinsic);

   nir_src *intrin_offset_src = nir_get_io_offset_src(intrin);
   for (unsigned i = 0; i < info->num_srcs; i++) {
      assert(intrin->src[i].is_ssa);
      if (i == 0 && store_src) {
         assert(!info->has_dest);
         assert(&intrin->src[i] != intrin_offset_src);
         dup->src[i] = nir_src_for_ssa(store_src);
      } else if (&intrin->src[i] == intrin_offset_src) {
         dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa,
                                                       offset));
      } else {
         dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa);
      }
   }

   dup->num_components = num_components;

   for (unsigned i = 0; i < info->num_indices; i++)
      dup->const_index[i] = intrin->const_index[i];

   nir_intrinsic_set_align(dup, align, 0);

   if (info->has_dest) {
      assert(intrin->dest.is_ssa);
      nir_ssa_dest_init(&dup->instr, &dup->dest,
                        num_components, bit_size,
                        intrin->dest.ssa.name);
   } else {
      nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1);
   }

   nir_builder_instr_insert(b, &dup->instr);

   return info->has_dest ? &dup->dest.ssa : NULL;
}

static bool
lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->dest.is_ssa);
   if (intrin->dest.ssa.bit_size == 32)
      return false;

   const unsigned bit_size = intrin->dest.ssa.bit_size;
   const unsigned num_components = intrin->dest.ssa.num_components;
   const unsigned bytes_read = num_components * (bit_size / 8);
   const unsigned align = nir_intrinsic_align(intrin);

   nir_ssa_def *result[NIR_MAX_VEC_COMPONENTS] = { NULL, };

   nir_src *offset_src = nir_get_io_offset_src(intrin);
   if (bit_size < 32 && nir_src_is_const(*offset_src)) {
      /* The offset is constant so we can use a 32-bit load and just shift it
       * around as needed.
       */
      const int load_offset = nir_src_as_uint(*offset_src) % 4;
      assert(load_offset % (bit_size / 8) == 0);
      const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4);
      /* A 16-bit vec4 is a 32-bit vec2.  We add an extra component in case
       * we offset into a component with load_offset.
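       *
       * For example, a 16-bit vec4 whose constant offset is 2 bytes into a
       * dword has bytes_read = 8 and load_offset = 2, so load_comps32 =
       * DIV_ROUND_UP(8 + 2, 4) = 3 and we end up loading a 32-bit vec3.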
       */
      assert(load_comps32 <= 3);

      nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset,
                                            load_comps32, 32, 4);

      nir_ssa_def *unpacked[3];
      for (unsigned i = 0; i < load_comps32; i++)
         unpacked[i] = nir_unpack_bits(b, nir_channel(b, load, i), bit_size);

      assert(load_offset % (bit_size / 8) == 0);
      const unsigned divisor = 32 / bit_size;

      for (unsigned i = 0; i < num_components; i++) {
         unsigned load_i = i + load_offset / (bit_size / 8);
         result[i] = nir_channel(b, unpacked[load_i / divisor],
                                    load_i % divisor);
      }
   } else {
      /* Otherwise, we have to break it into smaller loads */
      unsigned res_idx = 0;
      int load_offset = 0;
      while (load_offset < bytes_read) {
         const unsigned bytes_left = bytes_read - load_offset;
         unsigned load_bit_size, load_comps;
         if (align < 4) {
            load_comps = 1;
            /* Choose a byte, word, or dword */
            load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8;
         } else {
            assert(load_offset % 4 == 0);
            load_bit_size = 32;
            load_comps = DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
         }

         nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, load_offset,
                                               load_comps, load_bit_size,
                                               align);

         nir_ssa_def *unpacked = nir_bitcast_vector(b, load, bit_size);
         for (unsigned i = 0; i < unpacked->num_components; i++) {
            if (res_idx < num_components)
               result[res_idx++] = nir_channel(b, unpacked, i);
         }

         load_offset += load_comps * (load_bit_size / 8);
      }
   }

   nir_ssa_def *vec_result = nir_vec(b, result, num_components);
   nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                            nir_src_for_ssa(vec_result));
   nir_instr_remove(&intrin->instr);

   return true;
}

static bool
lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   assert(intrin->num_components == value->num_components);
   const unsigned bit_size = value->bit_size;
   const unsigned num_components = intrin->num_components;
   const unsigned bytes_written = num_components * (bit_size / 8);
   const unsigned align_mul = nir_intrinsic_align_mul(intrin);
   const unsigned align_offset = nir_intrinsic_align_offset(intrin);
   const unsigned align = nir_intrinsic_align(intrin);

   nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin);
   assert(writemask < (1 << num_components));

   if ((value->bit_size <= 32 && num_components == 1) ||
       (value->bit_size == 32 &&
        writemask == (1 << num_components) - 1))
      return false;

   nir_src *offset_src = nir_get_io_offset_src(intrin);
   const bool offset_is_const = nir_src_is_const(*offset_src);
   const unsigned const_offset = offset_is_const ?
      nir_src_as_uint(*offset_src) : 0;

   assert(num_components * (bit_size / 8) <= 32);
   uint32_t byte_mask = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (writemask & (1 << i))
         byte_mask |= ((1 << (bit_size / 8)) - 1) << i * (bit_size / 8);
   }

   while (byte_mask) {
      const int start = ffs(byte_mask) - 1;
      assert(start % (bit_size / 8) == 0);

      int end;
      for (end = start + 1; end < bytes_written; end++) {
         if (!(byte_mask & (1 << end)))
            break;
      }
      /* The size of the current contiguous chunk in bytes */
      const unsigned chunk_bytes = end - start;

      const bool is_dword_aligned =
         (align_mul >= 4 && (align_offset + start) % 4 == 0) ||
         (offset_is_const && (start + const_offset) % 4 == 0);

      unsigned store_comps, store_bit_size, store_align;
      if (chunk_bytes >= 4 && is_dword_aligned) {
         store_align = MAX2(align, 4);
         store_bit_size = 32;
         store_comps = MIN2(chunk_bytes, 16) / 4;
      } else {
         store_align = align;
         store_comps = 1;
         store_bit_size = MIN2(chunk_bytes, 4) * 8;
         /* The bit size must be a power of two */
         if (store_bit_size == 24)
            store_bit_size = 16;
      }

      const unsigned store_bytes = store_comps * (store_bit_size / 8);
      assert(store_bytes % (bit_size / 8) == 0);
      const unsigned store_first_src_comp = start / (bit_size / 8);
      const unsigned store_src_comps = store_bytes / (bit_size / 8);
      assert(store_first_src_comp + store_src_comps <= num_components);

      unsigned src_swiz[4] = { 0, };
      for (unsigned i = 0; i < store_src_comps; i++)
         src_swiz[i] = store_first_src_comp + i;
      nir_ssa_def *store_value =
         nir_swizzle(b, value, src_swiz, store_src_comps, false);
      nir_ssa_def *packed = nir_bitcast_vector(b, store_value, store_bit_size);

      dup_mem_intrinsic(b, intrin, packed, start,
                        store_comps, store_bit_size, store_align);

      byte_mask &= ~(((1u << store_bytes) - 1) << start);
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

static bool
lower_mem_access_bit_sizes_impl(nir_function_impl *impl)
{
   bool progress = false;

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         b.cursor = nir_after_instr(instr);

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_global:
         case nir_intrinsic_load_ssbo:
         case nir_intrinsic_load_shared:
            if (lower_mem_load_bit_size(&b, intrin))
               progress = true;
            break;

         case nir_intrinsic_store_global:
         case nir_intrinsic_store_ssbo:
         case nir_intrinsic_store_shared:
            if (lower_mem_store_bit_size(&b, intrin))
               progress = true;
            break;

         default:
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}

/**
 * This pass lowers arbitrary SSBO, shared, and global memory load/store
 * operations to intrinsics which the GEN hardware can handle natively.  In
 * particular, we have two general types of memory load/store messages:
 *
 *  - Untyped surface read/write:  These can load/store between one and four
 *    dword components to/from a dword-aligned offset.
 *
 *  - Byte scattered read/write:  These can load/store a single byte, word,
 *    or dword scalar to/from an unaligned byte offset.
 *
 * Neither type of message can do a write-masked store.  This pass converts
 * all NIR load/store intrinsics into a series of 8, 16, or 32-bit load/store
 * intrinsics with a number of components that we can directly handle in
 * hardware and with a trivial write-mask.
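 *
 * For example, following the rules above, a 16-bit vec3 load from a
 * constant, dword-aligned offset becomes a single 32-bit vec2 load whose
 * channels are unpacked back into 16-bit values, while the same load with a
 * non-constant offset and only 2-byte alignment is split into a scalar
 * 32-bit load followed by a scalar 16-bit load.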
 */
bool
brw_nir_lower_mem_access_bit_sizes(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(func, shader) {
      if (func->impl && lower_mem_access_bit_sizes_impl(func->impl))
         progress = true;
   }

   return progress;
}
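
/*
 * Illustrative usage sketch (an assumption, not part of this pass): a caller
 * would typically run the lowering late in its NIR pipeline and, when it
 * reports progress, follow it with a round of standard NIR cleanup so the
 * pack/unpack and bitcast ALU instructions it emits get folded away.  The
 * function name below is hypothetical and the exact cleanup loop will vary
 * by driver.
 */
#if 0
static void
example_run_mem_access_lowering(nir_shader *nir)
{
   if (brw_nir_lower_mem_access_bit_sizes(nir)) {
      bool progress;
      do {
         progress = false;
         progress |= nir_copy_prop(nir);
         progress |= nir_opt_algebraic(nir);
         progress |= nir_opt_constant_folding(nir);
         progress |= nir_opt_cse(nir);
         progress |= nir_opt_dce(nir);
      } while (progress);
   }
}
#endif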