/*
   This file is part of a programme to research fast Fermat factorisation algorithms.

   This is free software: you can redistribute it and/or modify
   it under the terms of version 3 of the GNU General Public License as
   published by the Free Software Foundation.

   It is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public Licence
   along with this programme.  If not, see <https://www.gnu.org/licenses/>.

   A copy of the licence is also available at
   https://www.cambridgeclarion.org/download/GPLv3.txt
   It has the following cryptographic hash
   SHA256 3972dc9744f6499f0f9b2dbf76696f2ae7ad8af9b23dde66d6af86c9dfb36986

*/

/*

    The software in this file was taken from the
    the public domain software YAFU and modified.

    The modifications are copyright 2022 Stephen Hewitt,
    licensed as above.

    The header in the original YAFU file follows:
*/

/*----------------------------------------------------------------------
This source distribution is placed in the public domain by its author,
Ben Buhrow. You may use it for any purpose, free of charge,
without having to notify anyone. I disclaim any responsibility for any
errors.

Optionally, please be nice and tell me if you find this source to be
useful. Again optionally, if you add to the functionality present here
please consider making those additions public too, so that others may
benefit from your work.	

Some parts of the code (and also this header), included in this
distribution have been reused from other sources. In particular I
have benefitted greatly from the work of Jason Papadopoulos's msieve @
www.boo.net/~jasonp, Scott Contini's mpqs implementation, and Tom St.
Denis Tom's Fast Math library.  Many thanks to their kind donation of
code to the public domain.
       				   --bbuhrow@gmail.com 11/24/09
----------------------------------------------------------------------*/

#include <sys/time.h>
#include <stdlib.h>
#include "gmp.h"
#include "zf.h"

typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned long uint32;
typedef unsigned long long uint64;

#define setbit(a,b) (((a)[(b) >> 3]) |= (nmasks[(b) & 7]))
#define getbit(a,b) (((a)[(b) >> 3]) & (nmasks[(b) & 7]))

int zFermat1(Output *pout, mpz_t gmp_n)
{
    // Fermat's factorization method with a sieve-based improvement
    // provided by 'neonsignal'
    mpz_t a, b2, tmp, multN, a2;
    int i;
    uint64 i64;
    uint32 M = 2 * 2 * 2 * 2 * 3 * 3 * 5 * 5 * 7 * 7; //176400u
    uint32 M1 = 11 * 17 * 23 * 31; //133331u
    uint32 M2 = 13 * 19 * 29 * 37; //265031u
    uint8 *sqr, *sqr1, *sqr2, *mod, *mod1, *mod2;
    uint16 *skip;
    uint32 m, mmn, s, d;
    uint8 masks[8] = {0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f};
    uint8 nmasks[8];
    uint32 iM = 0, iM1 = 0, iM2 = 0;
    uint32 mult = 1; // multiplier not supported for now
    int result;
    struct timeval tv_enter;
    struct timeval tv_start_search;
    struct timeval tv_found;

    gettimeofday(&tv_enter, 0);

    mpz_init(a);
    mpz_init(b2);
    mpz_init(tmp);
    mpz_init(multN);
    mpz_init(a2);

    // apply the user supplied multiplier
    mpz_mul_ui(multN, gmp_n, mult);

    // compute ceil(sqrt(multN))
    mpz_sqrt(a, multN);

    // form b^2
    mpz_mul(b2, a, a);
    mpz_sub(b2, b2, multN);

    // test successive 'a' values using a sieve-based approach.
    // the idea is that not all 'a' values allow a^2 or b^2 to be square.
    // we pre-compute allowable 'a' values modulo various smooth numbers and
    // build tables to allow us to quickly iterate over 'a' values that are
    // more likely to produce squares.
    // init sieve structures
    sqr = (uint8 *)calloc((M / 8 + 1) , sizeof(uint8));
    sqr1 = (uint8 *)calloc((M1 / 8 + 1) , sizeof(uint8));
    sqr2 = (uint8 *)calloc((M2 / 8 + 1) , sizeof(uint8));
    mod = (uint8 *)calloc((M / 8 + 1) , sizeof(uint8));
    mod1 = (uint8 *)calloc((M1 / 8 + 1) , sizeof(uint8));
    mod2 = (uint8 *)calloc((M2 / 8 + 1) , sizeof(uint8));
    skip = (uint16 *)malloc(M * sizeof(uint16));

    // test it.  This will be good enough if |u*p-v*q| < 2 * N^(1/4), where
    // mult = u*v
    if (mpz_perfect_square_p(b2))
        goto found;

    for (i=0; i<8; i++)
        nmasks[i] = ~masks[i];

    // marks locations where squares can occur mod M, M1, M2
    for (i64 = 0; i64 < M; ++i64)
        setbit(sqr, (i64*i64)%M);

    for (i64 = 0; i64 < M1; ++i64)
        setbit(sqr1, (i64*i64)%M1);

    for (i64 = 0; i64 < M2; ++i64)
        setbit(sqr2, (i64*i64)%M2);

    // for the modular sequence of b*b = a*a - n values
    // (where b2_2 = b2_1 * 2a + 1), mark locations where
    // b^2 can be a square
    m = mpz_mod_ui(tmp, a, M);
    mmn = mpz_mod_ui(tmp, b2, M);
    for (i = 0; i < M; ++i)
    {
        if (getbit(sqr, mmn)) setbit(mod, i);
        mmn = (mmn+m+m+1)%M;
        m = (m+1)%M;
    }

    // we only consider locations where the modular sequence mod M can
    // be square, so compute the distance to the next square location
    // at each possible value of i mod M.
    s = 0;
    d = 0;
    for (i = 0; !getbit(mod,i); ++i)
        ++s;
    for (i = M; i > 0;)
    {
        --i;
        ++s;
        skip[i] = s;
        if (s > d) d = s;
        if (getbit(mod,i)) s = 0;
    }
    //printf("maxSkip = %u\n", d);
    pout->max_skip = d;

    // for the modular sequence of b*b = a*a - n values
    // (where b2_2 = b2_1 * 2a + 1), mark locations where the
    // modular sequence can be a square mod M1.  These will
    // generally differ from the sequence mod M.
    m = mpz_mod_ui(tmp, a, M1);
    mmn = mpz_mod_ui(tmp, b2, M1);
    for (i = 0; i < M1; ++i)
    {
        if (getbit(sqr1, mmn)) setbit(mod1, i);
        mmn = (mmn+m+m+1)%M1;
        m = (m+1)%M1;
    }

    // for the modular sequence of b*b = a*a - n values
    // (where b2_2 = b2_1 * 2a + 1), mark locations where the
    // modular sequence can be a square mod M2.  These will
    // generally differ from the sequence mod M or M1.
    m = mpz_mod_ui(tmp, a, M2);
    mmn = mpz_mod_ui(tmp, b2, M2);
    for (i = 0; i < M2; ++i)
    {
        if (getbit(sqr2, mmn)) setbit(mod2, i);
        mmn = (mmn+m+m+1)%M2;
        m = (m+1)%M2;
    }

    gettimeofday(&tv_start_search, 0);

    // loop, checking for perfect squares
    mpz_mul_2exp(a2, a, 1);

    do
    {
        d = 0;
        do
        {
            // skip to the next possible square residue of b*b mod M
            s = skip[iM];

            // remember how far we skipped
            d += s;

            // update the other residue indices
            if ((iM1 += s) >= M1) iM1 -= M1;
            if ((iM2 += s) >= M2) iM2 -= M2;
            if ((iM += s) >= M) iM -= M;

            // continue if either of the other residues indicates
            // non-square.
        } while (!getbit(mod1,iM1) || !getbit(mod2,iM2));

        // form b^2 by incrementing by many factors of 2*a+1
        mpz_add_ui(tmp, a2, d);
        mpz_mul_ui(tmp, tmp, d);
        mpz_add(b2, b2, tmp);

        // accumulate so that we can reset d
        // (and thus keep it single precision)
        mpz_add_ui(a2, a2, d*2);

    } while (!mpz_perfect_square_p(b2));

found:
    gettimeofday(&tv_found, 0);

    pout->setup_secs = elapsed_secs(tv_start_search, tv_enter);
    pout->search_secs = elapsed_secs(tv_found, tv_start_search);

    if ((mpz_size(b2) > 0) && mpz_perfect_square_p(b2))
        result = check_result(b2, gmp_n);
    else
        result = 134; // "fatal b2 is not as expected"

free_and_return:
    mpz_clear(tmp);
    mpz_clear(a);
    mpz_clear(b2);
    mpz_clear(multN);
    mpz_clear(a2);
    free(sqr);
    free(sqr1);
    free(sqr2);
    free(mod);
    free(mod1);
    free(mod2);
    free(skip);

    return result;
}