|
Page 4 of 7 4. Common Subexpression Elimination Many a times, it happens that the same expression is evaluated at different places in the same program and the values of the operands in the expression do not change in between the two evaluations of that expression. For example, the program may evaluate say a * b at the beginning and at the end. If the values of a and b do not change in between these two evaluations of a * b, then instead of evaluating a * b again at the end, we can save the result of the evaluation of a * b at the beginning in some temporary variable and use it at the end. This will help eliminate redundant computations in the program. This optimization is called as common subexpression elimination. Consider the following program that demonstrates it. 1 : /* test3.c */ 2 : /* common subexpression elimination, and also of constant propagation */ 3 : #include <stdio.h> 4 : 5 : int main() 6 : { 7 : int a, b; 8 : int x, y, z; 9 : scanf("%d %d", &a, &b); 10 : x = a * b; 11 : 12 : if(b >= 4) 13 : { 14 : y = a * b; 15 : z = 0; 16 : } 17 : else 18 : { 19 : z = a * b * 4; 20 : y = 0; 21 : } 22 : 23 : printf("x = %d, y = %d, z = %d\n", x, y, z); 24 : return 0; 25 : } 26 : /* end of test3.c */ 27 : 28 : /* ----------------------------------------------------------------- */ 29 : /* generated unoptimized assembly language code */ 30 : .file "test3.c" 31 : .version "01.01" 32 : gcc2_compiled.: 33 : .section .rodata 34 : .LC0: 35 : .string "%d %d" 36 : .LC1: 37 : .string "x = %d, y = %d, z = %d\n" 38 : .text 39 : .align 4 40 : .globl main 41 : .type main,@function 42 : main: 43 : pushl %ebp /* save EBP */ 44 : movl %esp,%ebp /* EBP = ESP */ 45 : subl $20,%esp /* Create space for 5 variables */ 46 : 47 : /* scanf("%d %d". &a, &b); */ 48 : leal -8(%ebp),%eax 49 : pushl %eax /* push address of b on stack */ 50 : leal -4(%ebp),%eax 51 : pushl %eax /* push address of a on stack */ 52 : pushl $.LC0 /* push address of string */ 53 : call scanf 54 : addl $12,%esp /* stack cleanup after scanf */ 55 : 56 : /* x = a * b; */ 57 : movl -4(%ebp),%eax /* EAX = a */ 58 : imull -8(%ebp),%eax /* EAX = EAX * b = a * b */ 59 : movl %eax,-12(%ebp) /* x = EAX = a * b */ 60 : 61 : /* if( b >= 4)... */ 62 : cmpl $3,-8(%ebp) /* compare b with 3 */ 63 : jle .L2 /* else part at label .L2, if follows */ 64 : 65 : /* y = a * b; */ 66 : movl -4(%ebp),%eax /* EAX = a */ 67 : imull -8(%ebp),%eax /* EAX = EAX * b = a * b */ 68 : movl %eax,-16(%ebp) /* y = EAX = a * b */ 69 : /* z = 0; */ 70 : movl $0,-20(%ebp) 71 : jmp .L3 /* jump over the else part */ 72 : 73 : .p2align 4,,7 74 : .L2: 75 : /* else part begins here */ 76 : 77 : /* z = a * b * 4; */ 78 : movl -4(%ebp),%eax /* EAX = a */ 79 : imull -8(%ebp),%eax /* EAX = EAX * b = a * b */ 80 : leal 0(,%eax,4),%edx /* EDX = EAX*4 + 0 */ 81 : movl %edx,-20(%ebp) /* z = EDX */ 82 : /* y = 0; */ 83 : movl $0,-16(%ebp) 84 : .L3: 85 : /* if..else is over here */ 86 : 87 : /* printf("x = %d, y = %d, z = %d\n", x, y, x); */ 88 : movl -20(%ebp),%eax 89 : pushl %eax /* push address of z on stack */ 90 : movl -16(%ebp),%eax 91 : pushl %eax /* push address of y on stack */ 92 : movl -12(%ebp),%eax 93 : pushl %eax /* push address of x on stack */ 94 : pushl $.LC1 /* address of string */ 95 : call printf 96 : addl $16,%esp /* stack cleanup after printf */ 97 : 98 : /* return 0 */ 99 : xorl %eax,%eax 100 : jmp .L1 101 : .p2align 4,,7 102 : .L1: 103 : leave 104 : ret 105 : .Lfe1: 106 : .size main,.Lfe1-main 107 : .ident "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)" 108 : /* end of unoptimized assembly language code */ 109 : /* --------------------------------------------------------------- */ 110 : 111 : /* --------------------------------------------------------------- */ 112 : /* generated optimized assembly language code */ 113 : .file "test3.c" 114 : .version "01.01" 115 : gcc2_compiled.: 116 : .section .rodata 117 : .LC0: 118 : .string "%d %d" 119 : .LC1: 120 : .string "x = %d, y = %d, z = %d\n" 121 : .text 122 : .align 4 123 : .globl main 124 : .type main,@function 125 : main: 126 : pushl %ebp /* save EBP */ 127 : movl %esp,%ebp /* EBP = ESP */ 128 : subl $8,%esp /* space for just 2 variables on stack */ 129 : 130 : /* scanf("%d %d", &a, &b); */ 131 : leal -4(%ebp),%eax 132 : pushl %eax /* push address of b on stack */ 133 : leal -8(%ebp),%eax 134 : pushl %eax /* push address of a on stack */ 135 : pushl $.LC0 /* address of string */ 136 : call scanf 137 : 138 : /* x = a * b; */ 139 : movl -4(%ebp),%eax /* EAX = b */ 140 : movl %eax,%edx /* EDX = EAX = b */ 141 : imull -8(%ebp),%edx /* EDX = EDX * a = b * a = a * b */ 142 : 143 : addl $12,%esp /* delayed stack cleanup */ 144 : /* if( b >= 4).... */ 145 : cmpl $3,%eax /* compare EAX = b with 3 */ 146 : jle .L17 /* else part from label .L17 */ 147 : 148 : /* y stored in ECX, z in EAX, x in EDX */ 149 : /* y = a * b; */ 150 : movl %edx,%ecx 151 : /* z = 0; */ 152 : xorl %eax,%eax 153 : jmp .L18 /* jump over the else part */ 154 : .p2align 4,,7 155 : .L17: 156 : /* z = a * b * 4; */ 157 : leal 0(,%edx,4),%eax /* LEA EAX, [EDX*4]+0 */ 158 : /* y = 0; */ 159 : xorl %ecx,%ecx 160 : .L18: 161 : pushl %eax /* push value of z */ 162 : pushl %ecx /* push value of y */ 163 : pushl %edx /* push value of x */ 164 : pushl $.LC1 /* push address of string */ 165 : call printf 166 : /* stack cleanup after printf not necessary */ 167 : 168 : /* return 0; */ 169 : xorl %eax,%eax 170 : leave 171 : ret 172 : .Lfe1: 173 : .size main,.Lfe1-main 174 : .ident "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)" 175 : /* end optimized assembly code */ 176 : /* -------------------------------------------------------------- */
This program has an example of a common subexpression. On line 10, the expression a * b is evaluated the first time, and then again on lines 14 and 19. The last two evaluations of a * b are redundant since the value of neither a nor b changes after the first evaluation. Thus these two evaluations are common subexpressions that can be eliminated. Lines 30 to 108 show the unoptimized assembly code that is a straightforward translation of the C code and should be easy to understand. Now consider the optimized assembly language code from lines 113 to 176. The first thing to notice is that now only variables a and b are stored on the stack, hence a stack of just 8 bytes is required as opposed to a 20 byte stack for the unoptimized version. You may wonder what's so special about variables a and b. The specialty is that the address of variables a and b is used in the program (for scanf()) and variables that reside in the registers cannot have a memory address. Hence the compiler is unable to put them inside the registers. The registers that the compiler chooses for other variables are as follows: x in EDX, y in ECX and z in EAX. Statements 139 to 141 evaluate a * b and the value is stored in EDX (which holds x). Also, the value of b is available in the register EAX. One more thing to notice is the delayed cleanup of stack after the call to scanf() on line 143. May be there is some advantage in doing so, I don't know. Next starts the if statement. In the if part, the expression a * b is reevaluated and we expect that the compiler will use the value stored in EDX. That is exactly what the compiler does. For the statement y = a * b, the generated code on line 150 simply copies the value of a * b in register EDX into the register ECX (which holds y). In the else part again, there is a common subexpression in the statement z = a * b * 4. Thus we expect that the compiler will take the value of a * b in EDX, multiply it by 4 and then store the result in EAX (which holds z). Now there are many ways in which this can be done. One is to use a sequence of movl, imull instructions as in movl %edx, %eax /* EAX = EDX = a * b */ imull $4, %eax /* EAX = EAX * 4 */
The second choice is to use a shift (sall) instead of imull in the above sequence of instructions. However, it turns out that the GCC uses an obscure speed trick to optimize the code. It uses the instruction leal 0(,%edx,4), %eax
This instruction uses the scaling and indexing capabilities of the 80386 processors. The above instruction takes EDX as the index register, 4 as the scale and 0 as the offset, calculates the effective address and stores it in the EAX register. Thus the register EAX gets a value EDX(=a*b)*4 + 0 i.e. a * b * 4. The leal instruction always executes in 2 clock cycles on an 80386, whereas a simple shift will take around 7 clock cycles. We see that the compiler knows about processor peculiarities and uses it when it generates the code. Thus it can be seen that common subexpressions save a lot of computations and also space that is used up by the code for these redundant computations. At this stage, one can understand that the compiler when analyzing the program for optimization must keep a track of how and when the variables change, the expressions that are evaluated and also which registers are allocated to variables. In general such a kind of analysis that the compiler performs on the program is called as data flow analysis.
|