- From 9bb867075fc4b0b7efc1640dc4cdd5b70b482ff1 Mon Sep 17 00:00:00 2001
- From: Jonas Pfeil <pfeiljonas@gmx.de>
- Date: Tue, 8 Nov 2016 00:18:39 +0100
- Subject: [PATCH] drm/vc4: Add fragment shader threading support
- FS threading brings performance improvements of 0-20% in glmark2.
- The validation code checks for thread switch signals and ensures that
- the registers of the other thread are not touched, and that our clamps
- are not live across thread switches. It also checks that the
- threading and branching instructions do not interfere.
- (Original patch by Jonas, changes by anholt for style cleanup,
- removing validation the kernel doesn't need to do, and adding the flag
- for userspace).
- v2: Minor style fixes from checkpatch.
- Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de>
- Signed-off-by: Eric Anholt <eric@anholt.net>
- (cherry picked from commit c778cc5df944291dcdb1ca7a6bb781fbc22550c5)
- ---
- drivers/gpu/drm/vc4/vc4_drv.c | 1 +
- drivers/gpu/drm/vc4/vc4_drv.h | 2 +
- drivers/gpu/drm/vc4/vc4_validate.c | 17 +++++---
- drivers/gpu/drm/vc4/vc4_validate_shaders.c | 63 ++++++++++++++++++++++++++++++
- include/uapi/drm/vc4_drm.h | 1 +
- 5 files changed, 79 insertions(+), 5 deletions(-)
- --- a/drivers/gpu/drm/vc4/vc4_drv.c
- +++ b/drivers/gpu/drm/vc4/vc4_drv.c
- @@ -107,6 +107,7 @@ static int vc4_get_param_ioctl(struct dr
- break;
- case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
- case DRM_VC4_PARAM_SUPPORTS_ETC1:
- + case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
- args->value = true;
- break;
- default:
- --- a/drivers/gpu/drm/vc4/vc4_drv.h
- +++ b/drivers/gpu/drm/vc4/vc4_drv.h
- @@ -395,6 +395,8 @@ struct vc4_validated_shader_info {
-
- uint32_t num_uniform_addr_offsets;
- uint32_t *uniform_addr_offsets;
- +
- + bool is_threaded;
- };
-
- /**
- --- a/drivers/gpu/drm/vc4/vc4_validate.c
- +++ b/drivers/gpu/drm/vc4/vc4_validate.c
- @@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device
- exec->shader_rec_v += roundup(packet_size, 16);
- exec->shader_rec_size -= packet_size;
-
- - if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
- - DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
- - return -EINVAL;
- - }
- -
- for (i = 0; i < shader_reloc_count; i++) {
- if (src_handles[i] > exec->bo_count) {
- DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
- @@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device
- return -EINVAL;
- }
-
- + if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
- + to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
- + DRM_ERROR("Thread mode of CL and FS do not match\n");
- + return -EINVAL;
- + }
- +
- + if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
- + to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
- + DRM_ERROR("cs and vs cannot be threaded\n");
- + return -EINVAL;
- + }
- +
- for (i = 0; i < shader_reloc_count; i++) {
- struct vc4_validated_shader_info *validated_shader;
- uint32_t o = shader_reloc_offsets[i];
- --- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
- +++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
- @@ -83,6 +83,13 @@ struct vc4_shader_validation_state {
- * basic blocks.
- */
- bool needs_uniform_address_for_loop;
- +
- + /* Set when we find an instruction reading or writing the top
- + * half of the register files. A threaded shader must stay out
- + * of those registers: they belong to the other thread on our
- + * QPU, and writing them would invalidate its clamp validation.
- + */
- + bool all_registers_used;
- };
-
- static uint32_t
- @@ -119,6 +126,13 @@ raddr_add_a_to_live_reg_index(uint64_t i
- }
-
- static bool
- +live_reg_is_upper_half(uint32_t lri)
- +{
- + return (lri >= 16 && lri < 32) ||
- + (lri >= 32 + 16 && lri < 32 + 32);
- +}
- +
- +static bool
- is_tmu_submit(uint32_t waddr)
- {
- return (waddr == QPU_W_TMU0_S ||
- @@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_sha
- } else {
- validation_state->live_immediates[lri] = ~0;
- }
- +
- + if (live_reg_is_upper_half(lri))
- + validation_state->all_registers_used = true;
- }
-
- switch (waddr) {
- @@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_valid
- }
- }
-
- + if ((raddr_a >= 16 && raddr_a < 32) ||
- + (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
- + validation_state->all_registers_used = true;
- + }
- +
- return true;
- }
-
- @@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_o
- {
- bool found_shader_end = false;
- int shader_end_ip = 0;
- + uint32_t last_thread_switch_ip = -3;
- uint32_t ip;
- struct vc4_validated_shader_info *validated_shader = NULL;
- struct vc4_shader_validation_state validation_state;
- @@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_o
- if (!vc4_handle_branch_target(&validation_state))
- goto fail;
-
- + if (ip == last_thread_switch_ip + 3) {
- + /* Reset r0-r3 live clamp data */
- + int i;
- +
- + for (i = 64; i < LIVE_REG_COUNT; i++) {
- + validation_state.live_min_clamp_offsets[i] = ~0;
- + validation_state.live_max_clamp_regs[i] = false;
- + validation_state.live_immediates[i] = ~0;
- + }
- + }
- +
- switch (sig) {
- case QPU_SIG_NONE:
- case QPU_SIG_WAIT_FOR_SCOREBOARD:
- @@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_o
- case QPU_SIG_LOAD_TMU1:
- case QPU_SIG_PROG_END:
- case QPU_SIG_SMALL_IMM:
- + case QPU_SIG_THREAD_SWITCH:
- + case QPU_SIG_LAST_THREAD_SWITCH:
- if (!check_instruction_writes(validated_shader,
- &validation_state)) {
- DRM_ERROR("Bad write at ip %d\n", ip);
- @@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_o
- shader_end_ip = ip;
- }
-
- + if (sig == QPU_SIG_THREAD_SWITCH ||
- + sig == QPU_SIG_LAST_THREAD_SWITCH) {
- + validated_shader->is_threaded = true;
- +
- + if (ip < last_thread_switch_ip + 3) {
- + DRM_ERROR("Thread switch too soon after "
- + "last switch at ip %d\n", ip);
- + goto fail;
- + }
- + last_thread_switch_ip = ip;
- + }
- +
- break;
-
- case QPU_SIG_LOAD_IMM:
- @@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_o
- if (!check_branch(inst, validated_shader,
- &validation_state, ip))
- goto fail;
- +
- + if (ip < last_thread_switch_ip + 3) {
- + DRM_ERROR("Branch in thread switch at ip %d",
- + ip);
- + goto fail;
- + }
- +
- break;
- default:
- DRM_ERROR("Unsupported QPU signal %d at "
- @@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_o
- goto fail;
- }
-
- + /* Might corrupt other thread */
- + if (validated_shader->is_threaded &&
- + validation_state.all_registers_used) {
- + DRM_ERROR("Shader uses threading, but uses the upper "
- + "half of the registers, too\n");
- + goto fail;
- + }
- +
- /* If we did a backwards branch and we haven't emitted a uniforms
- * reset since then, we still need the uniforms stream to have the
- * uniforms address available so that the backwards branch can do its
- --- a/include/uapi/drm/vc4_drm.h
- +++ b/include/uapi/drm/vc4_drm.h
- @@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state {
- #define DRM_VC4_PARAM_V3D_IDENT2 2
- #define DRM_VC4_PARAM_SUPPORTS_BRANCHES 3
- #define DRM_VC4_PARAM_SUPPORTS_ETC1 4
- +#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5
-
- struct drm_vc4_get_param {
- __u32 param;