## Demultiplex the pooled paired-end FASTQ files on the forward indexing barcode
## encoded in the first 8 bp of R1.
##   - Barcodes are read from fw_barcode.fasta and anchored to the start of R1 (^).
##   - cutadapt substitutes {name} in the output paths with the matching barcode's
##     FASTA record name, giving one R1/R2 pair per barcode.
##   - NOTE(review): -e 1 is presumably meant as "at most one error per barcode";
##     how a value of 1 is interpreted depends on the cutadapt version - confirm.

module load cutadapt

cutadapt \
  -e 1 \
  --front "^file:fw_barcode.fasta" \
  --output "{name}.R1.fastq" \
  --paired-output "{name}.R2.fastq" \
  CROP_VIP_Pool_R1.fastq \
  CROP_VIP_Pool_R2.fastq

# Trim every demultiplexed R1 FASTQ down to its 20 bp protospacer:
#   1. -g removes the U6 priming site (and anything upstream of it) from the 5' end;
#   2. -l 20 then shortens the read to 20 bp, which drops the scaffold at the 3' end.
# The result for <name>.fastq is written to <name>_cut.fastq.
#
# NOTE: the glob matches every file ending in R1.fastq in the working directory,
# not only the per-barcode files (e.g. CROP_VIP_Pool_R1.fastq if it lives here too).

for fastq in *R1.fastq; do
  # With no matches the shell leaves the literal pattern in place; skip it
  # instead of handing a non-existent file to cutadapt.
  [ -e "$fastq" ] || continue

  # Swap only the trailing ".fastq" for "_cut.fastq" (a suffix strip, so an
  # earlier "fastq" inside the name is never touched).
  out="${fastq%.fastq}_cut.fastq"
  printf '%s\n' "$out"

  # Report a failed sample on stderr but keep going with the remaining files.
  cutadapt -g TCTTGTGGAAAGGACGAAACACCG -l 20 -o "$out" "$fastq" \
    || printf 'cutadapt failed for %s\n' "$fastq" >&2

  # Short pause between samples, kept from the original workflow.
  sleep 0.3
done

# Build a counts matrix from the trimmed protospacer FASTQs with mageck, using the
# full protospacer list in GuEST_List_Protospacer_Reference.csv as the reference.
# Output files are prefixed with pDNA_Diversity_Check.
# Run this inside a conda environment that has mageck and its dependencies installed.

mageck count \
  --list-seq GuEST_List_Protospacer_Reference.csv \
  --output-prefix pDNA_Diversity_Check \
  --sample-label Sample1,Sample2 \
  --fastq Sample1.R1_cut.fastq Sample2.R1_cut.fastq