Issue
file1
chr1 123896 rs0987522 A T
chr5 678452 rs8733521 G C,A
file2
chr1 123896 rs0987522 A T,C,G
chr5 678452 rs8733521 G A,T
output
chr1 123896 rs0987522 A T*,C,G
chr5 678452 rs8733521 G C!,A*,T
If 4th col of file1 is matching with any character is 4th column of file2 (A==>> T,A,G,C) then rows from file2 should be printed as given above in output and matching character from file1 (here T from 1st row column5) should be represented by "" (T)and non-matching character from file1 should be represented by "!" (C!).
awk 'NR==FNR{firstfile[$1,$2,$4];next} ($1,$2,$4) in firstfile' File1 file2
I have used awk but only for (1,2,4) columns. Kindly help for the 5th one matching. 5th columns comma separated characters may have different orders in file1 & file2.
Solution
With your shown samples, please try following awk
program.
awk '
FNR==NR{
arr1[$1,$2,$4]=$5
next
}
{
val=""
delete arr2;delete arr3;delete arr4;delete arr5
num1=split(arr1[$1,$2,$4],arr2,",")
for(i=1;i<=num1;i++){ arr4[arr2[i]] }
num2=split($NF,arr3,",")
}
(($1,$2,$4) in arr1){
for(i=1;i<=num2;i++){
val=(val?val ",":"")(arr3[i] in arr4?arr3[i]"*":arr3[i]"!")
if(arr3[i] in arr4){ arr5[arr3[i]] }
}
for(i=1;i<=num1;i++){
if(!(arr2[i] in arr5)){
val=val "," arr2[i]
}
}
$5=val
}
1
' Input_file2 Input_file1
Explanation: Adding detailed explanation for above.
awk ' ##Starting awk program from here.
FNR==NR{ ##Checking FNR==NR which will be true when file2 is being read.
arr1[$1,$2,$4]=$5 ##Creating arr1 with index of 1st, 2nd and 4th field and value of 5th field.
next ##next will skip all further statements from here.
}
{
val="" ##Nullifying val here.
delete arr2;delete arr3;delete arr4;delete arr5 ##Deleting arrays here.
num1=split(arr1[$1,$2,$4],arr2,",") ##Splitting arr1 with index of $1,$2,$4 here to arr2.
for(i=1;i<=num1;i++){ arr4[arr2[i]] } ##Running loop till num1, creating arr4 with value of arr2 index of i here.
num2=split($NF,arr3,",") ##Splitting current line last field to arr3 with separator of comma here.
}
(($1,$2,$4) in arr1){ ##Checking if $1,$2,$4 of current line are present in arr1 then do following.
for(i=1;i<=num2;i++){ ##Running for loop till num2 here.
val=(val?val ",":"")(arr3[i] in arr4?arr3[i]"*":arr3[i]"!") ##Creating val which compares values of file1 and file2 is they are common then add * or add ! of file1 current value(one of the 5th field values).
if(arr3[i] in arr4){ arr5[arr3[i]] } ##If arr3 value is present in arr4 then create arr5 with index of value of arr3 with index of i.
}
for(i=1;i<=num1;i++){ ##Running loop till value of num1 here.
if(!(arr2[i] in arr5)){ ##If value of arr2 is NOT present in arr5(to get values which are already printed common ones of file1, file2) then do following.
val=val "," arr2[i] ##Append arr2 value to val.
}
}
$5=val ##Assign val to 5th field here.
}
1 ##Printing edited/non-edited line here.
' Input_file2 Input_file1 ##Mentioning Input_file names here.
Answered By - RavinderSingh13